Joal has submitted this change and it was merged.
Change subject: Use webrequest_source text for AppSessionMetrics, mobile is merging with text
..
Use webrequest_source text for AppSessionMetrics, mobile is merging with text
Bug: T122651
Change-Id: I39eb113335303d0df3d2aced007c6decb9c7aa8d
---
M refinery-job/src/main/scala/org/wikimedia/analytics/refinery/job/AppSessionMetrics.scala
M refinery-job/src/test/scala/org/wikimedia/analytics/refinery/job/AppSessionSuite.scala
2 files changed, 13 insertions(+), 11 deletions(-)
Approvals:
Joal: Looks good to me, approved
jenkins-bot: Verified
diff --git a/refinery-job/src/main/scala/org/wikimedia/analytics/refinery/job/AppSessionMetrics.scala b/refinery-job/src/main/scala/org/wikimedia/analytics/refinery/job/AppSessionMetrics.scala
index 6d6cf00..9ef5b95 100644
--- a/refinery-job/src/main/scala/org/wikimedia/analytics/refinery/job/AppSessionMetrics.scala
+++ b/refinery-job/src/main/scala/org/wikimedia/analytics/refinery/job/AppSessionMetrics.scala
@@ -186,11 +186,11 @@
/**
* Generate list of Parquet file paths over a range of dates
- * @param webrequestMobilePath Base path to webrequest mobile parquet data
+ * @param webrequestTextPath Base path to webrequest text parquet data
* @param datesInfo Hashmap with report date related info
* @return List of path strings like [".../day=1", ".../day=2"]
*/
- def dateRangeToPathList(webrequestMobilePath: String, datesInfo: Map[String, Int]): List[String] = {
+ def dateRangeToPathList(webrequestTextPath: String, datesInfo: Map[String, Int]): List[String] = {
//Custom iterator for stepping through LocalDate objects
def makeDateRange(from: LocalDate, to: LocalDate, step: Period): Iterator[LocalDate] =
Iterator.iterate(from)(_.plus(step)).takeWhile(_.isBefore(to))
@@ -198,7 +198,7 @@
val dateStart = new LocalDate(datesInfo("year"), datesInfo("month"), datesInfo("day"))
val dateEnd = dateStart.plusDays(datesInfo("periodDays"))
val dateRange = makeDateRange(dateStart, dateEnd, new Period().withDays(1))
-dateRange.toList.map(dt => "%s/year=%d/month=%d/day=%d".format(webrequestMobilePath, dt.getYear, dt.getMonthOfYear, dt.getDayOfMonth))
+dateRange.toList.map(dt => "%s/year=%d/month=%d/day=%d".format(webrequestTextPath, dt.getYear, dt.getMonthOfYear, dt.getDayOfMonth))
}
/**
@@ -211,7 +211,8 @@
*/
def pathListToUuidDataframe(paths: List[String], sqlContext: SQLContext): DataFrame = {
sqlContext.parquetFile(paths: _*)
- .filter("is_pageview and x_analytics_map['wmfuuid'] is not null and x_analytics_map['wmfuuid'] != ''")
+ .filter("is_pageview and access_method = 'mobile app' " +
+"and x_analytics_map['wmfuuid'] is not null and x_analytics_map['wmfuuid'] != ''")
.selectExpr("x_analytics_map['wmfuuid'] as wmfuuid", "CAST(ts AS int) as ts")
}
@@ -349,12 +350,13 @@
sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy")
// Generate a list of all parquet file paths to read given the webrequest base path,
-// and all dates related information
-val webrequestMobilePath = params.webrequestBasePath + "/webrequest_source=mobile"
+// and all dates related information. NOTE: As of January 2016,
+// mobile web caches have been merged with text, so webrequest_source=text.
+val webrequestTextPath = params.webrequestBasePath + "/webrequest_source=text"
// Helper hashmap with all date related information to avoid passing around lots of params
val datesInfo = HashMap("year" -> params.year, "month" -> params.month, "day" -> params.day, "periodDays" -> params.periodDays)
// List of path strings like [".../day=1", ".../day=2"]
-val webrequestPaths = dateRangeToPathList(webrequestMobilePath, datesInfo)
+val webrequestPaths = dateRangeToPathList(webrequestTextPath, datesInfo)
// Get sessions data for all users, calculate stats for different metrics,
// and get the stats in a printable string format to output
diff --git a/refinery-job/src/test/scala/org/wikimedia/analytics/refinery/job/AppSessionSuite.scala b/refinery-job/src/test/scala/org/wikimedia/analytics/refinery/job/AppSessionSuite.scala
index 27064ec..254f317 100644
--- a/refinery-job/src/test/scala/org/wikimedia/analytics/refinery/job/AppSessionSuite.scala
+++ b/refinery-job/src/test/scala/org/wikimedia/analytics/refinery/job/AppSessionSuite.scala
@@ -34,15 +34,15 @@
test("List of parquet paths is generated correctly based on the report run date and period") {
val datesInfo = HashMap("year" -> 2015, "month" -> 5, "day" -> 10, "periodDays" -> 10)
-val webrequestMobilePath = ".../webrequest_source=mobile"
-val pathList = AppSessionMetrics.dateRangeToPathList(webrequestMobilePath, datesInfo)
+val