Joal has submitted this change and it was merged. Change subject: Use webrequest_source text for AppSessionMetrics, mobile is merging with text ......................................................................
Use webrequest_source text for AppSessionMetrics, mobile is merging with text Bug: T122651 Change-Id: I39eb113335303d0df3d2aced007c6decb9c7aa8d --- M refinery-job/src/main/scala/org/wikimedia/analytics/refinery/job/AppSessionMetrics.scala M refinery-job/src/test/scala/org/wikimedia/analytics/refinery/job/AppSessionSuite.scala 2 files changed, 13 insertions(+), 11 deletions(-) Approvals: Joal: Looks good to me, approved jenkins-bot: Verified diff --git a/refinery-job/src/main/scala/org/wikimedia/analytics/refinery/job/AppSessionMetrics.scala b/refinery-job/src/main/scala/org/wikimedia/analytics/refinery/job/AppSessionMetrics.scala index 6d6cf00..9ef5b95 100644 --- a/refinery-job/src/main/scala/org/wikimedia/analytics/refinery/job/AppSessionMetrics.scala +++ b/refinery-job/src/main/scala/org/wikimedia/analytics/refinery/job/AppSessionMetrics.scala @@ -186,11 +186,11 @@ /** * Generate list of Parquet file paths over a range of dates - * @param webrequestMobilePath Base path to webrequest mobile parquet data + * @param webrequestTextPath Base path to webrequest text parquet data * @param datesInfo Hashmap with report date related info * @return List of path strings like [".../day=1", ".../day=2"] */ - def dateRangeToPathList(webrequestMobilePath: String, datesInfo: Map[String, Int]): List[String] = { + def dateRangeToPathList(webrequestTextPath: String, datesInfo: Map[String, Int]): List[String] = { //Custom iterator for stepping through LocalDate objects def makeDateRange(from: LocalDate, to: LocalDate, step: Period): Iterator[LocalDate] = Iterator.iterate(from)(_.plus(step)).takeWhile(_.isBefore(to)) @@ -198,7 +198,7 @@ val dateStart = new LocalDate(datesInfo("year"), datesInfo("month"), datesInfo("day")) val dateEnd = dateStart.plusDays(datesInfo("periodDays")) val dateRange = makeDateRange(dateStart, dateEnd, new Period().withDays(1)) - dateRange.toList.map(dt => "%s/year=%d/month=%d/day=%d".format(webrequestMobilePath, dt.getYear, dt.getMonthOfYear, dt.getDayOfMonth)) + dateRange.toList.map(dt => "%s/year=%d/month=%d/day=%d".format(webrequestTextPath, dt.getYear, dt.getMonthOfYear, dt.getDayOfMonth)) } /** @@ -211,7 +211,8 @@ */ def pathListToUuidDataframe(paths: List[String], sqlContext: SQLContext): DataFrame = { sqlContext.parquetFile(paths: _*) - .filter("is_pageview and x_analytics_map['wmfuuid'] is not null and x_analytics_map['wmfuuid'] != ''") + .filter("is_pageview and access_method = 'mobile app' " + + "and x_analytics_map['wmfuuid'] is not null and x_analytics_map['wmfuuid'] != ''") .selectExpr("x_analytics_map['wmfuuid'] as wmfuuid", "CAST(ts AS int) as ts") } @@ -349,12 +350,13 @@ sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy") // Generate a list of all parquet file paths to read given the webrequest base path, - // and all dates related information - val webrequestMobilePath = params.webrequestBasePath + "/webrequest_source=mobile" + // and all dates related information. NOTE: As of January 2016, + // mobile web caches have been merged with text, so webrequest_source=text. + val webrequestTextPath = params.webrequestBasePath + "/webrequest_source=text" // Helper hashmap with all date related information to avoid passing around lots of params val datesInfo = HashMap("year" -> params.year, "month" -> params.month, "day" -> params.day, "periodDays" -> params.periodDays) // List of path strings like [".../day=1", ".../day=2"] - val webrequestPaths = dateRangeToPathList(webrequestMobilePath, datesInfo) + val webrequestPaths = dateRangeToPathList(webrequestTextPath, datesInfo) // Get sessions data for all users, calculate stats for different metrics, // and get the stats in a printable string format to output diff --git a/refinery-job/src/test/scala/org/wikimedia/analytics/refinery/job/AppSessionSuite.scala b/refinery-job/src/test/scala/org/wikimedia/analytics/refinery/job/AppSessionSuite.scala index 27064ec..254f317 100644 --- a/refinery-job/src/test/scala/org/wikimedia/analytics/refinery/job/AppSessionSuite.scala +++ b/refinery-job/src/test/scala/org/wikimedia/analytics/refinery/job/AppSessionSuite.scala @@ -34,15 +34,15 @@ test("List of parquet paths is generated correctly based on the report run date and period") { val datesInfo = HashMap("year" -> 2015, "month" -> 5, "day" -> 10, "periodDays" -> 10) - val webrequestMobilePath = ".../webrequest_source=mobile" - val pathList = AppSessionMetrics.dateRangeToPathList(webrequestMobilePath, datesInfo) + val webrequestTextPath = ".../webrequest_source=text" + val pathList = AppSessionMetrics.dateRangeToPathList(webrequestTextPath, datesInfo) //Assert the length of the list equals report period in days assert(pathList.length == datesInfo("periodDays")) //Assert the paths are being generated correctly - assert(pathList.head == ".../webrequest_source=mobile/year=2015/month=5/day=10") - assert(pathList.last == ".../webrequest_source=mobile/year=2015/month=5/day=19") + assert(pathList.head == ".../webrequest_source=text/year=2015/month=5/day=10") + assert(pathList.last == ".../webrequest_source=text/year=2015/month=5/day=19") } } \ No newline at end of file -- To view, visit https://gerrit.wikimedia.org/r/264868 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I39eb113335303d0df3d2aced007c6decb9c7aa8d Gerrit-PatchSet: 3 Gerrit-Project: analytics/refinery/source Gerrit-Branch: master Gerrit-Owner: Ottomata <o...@wikimedia.org> Gerrit-Reviewer: Joal <j...@wikimedia.org> Gerrit-Reviewer: Madhuvishy <mviswanat...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits