Hello Ottomata,
I'd like you to do a code review. Please visit
https://gerrit.wikimedia.org/r/187416
to review the following change.
Change subject: Prepare webrequest dump for addition of refined data
......................................................................
Prepare webrequest dump for addition of refined data
Change-Id: Ic998bab779398c3d1d4bc64da2c44c1daf0031c9
---
M bin/refinery-dump-status-webrequest-partitions
1 file changed, 154 insertions(+), 15 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery
refs/changes/16/187416/1
diff --git a/bin/refinery-dump-status-webrequest-partitions b/bin/refinery-dump-status-webrequest-partitions
index 0391fc8..40b652f 100755
--- a/bin/refinery-dump-status-webrequest-partitions
+++ b/bin/refinery-dump-status-webrequest-partitions
@@ -12,6 +12,11 @@
--hdfs-mount MOUNT_POINT
-- Assume that HDFS is mounted at MOUNT_POINT (needs
to be an absolute path) instead of /mnt/hdfs .
+ --datasets DATASET1,DATASET2,...
+ -- Select the datasets to output data for.
+ The following datasets are available:
+ raw_webrequest -- Raw webrequest (hourly)
+ all -- all of the above
--quiet -- Only produce output, if there are faulty partitions
HOURS_TO_GO_BACK -- number of hours to go back in time. (Default: 51)
@@ -29,6 +34,28 @@
QUIET=
QUIET_BUFFER=
+ALL_DATASETS=()
+
+declare -A DATASET_CAPTIONS
+declare -A DATASET_HLINES
+declare -A DATASET_VISIBILITIES
+
+add_dataset() {
+ local DATASET="$1"
+ local DATASET_CAPTION="$2"
+ local DATASET_HLINE="$(sed -e 's/[^|]/-/g; s/|/+/g' <<<"$DATASET_CAPTION")"
+
+ ALL_DATASETS=( "${ALL_DATASETS[@]}" "$DATASET" )
+
+ DATASET_CAPTIONS["$DATASET"]="$DATASET_CAPTION"
+ DATASET_HLINES["$DATASET"]="$DATASET_HLINE"
+ DATASET_VISIBILITIES["$DATASET"]=no
+}
+
+add_dataset "raw_webrequest" " bits | misc | mobile | text | upload |"
+
+DATASET_VISIBILITIES["raw_webrequest"]=yes
+
error() {
echo "Error" "$@" >&2
exit 1
@@ -42,6 +69,44 @@
"--help" | "-h" | "-?" )
print_help
exit 1
+ ;;
+ "--datasets" )
+ [[ $# -gt 0 ]] || error "$PARAM expects a further parameter"
+
+ # Resetting previous visibilities
+ for INNER_DATASET in "${ALL_DATASETS[@]}"
+ do
+ DATASET_VISIBILITIES["$INNER_DATASET"]=no
+ done
+
+ IFS="," read -a DATASETS_SPLIT <<<"$1"
+ for DATASET in "${DATASETS_SPLIT[@]}"
+ do
+ case "$DATASET" in
+ "all" )
+ for INNER_DATASET in "${ALL_DATASETS[@]}"
+ do
+ DATASET_VISIBILITIES["$INNER_DATASET"]=yes
+ done
+ ;;
+ * )
+ FOUND_DATASET=no
+ for INNER_DATASET in "${ALL_DATASETS[@]}"
+ do
+ if [ "$DATASET" = "$INNER_DATASET" ]
+ then
+ DATASET_VISIBILITIES["$DATASET"]=yes
+ FOUND_DATASET=yes
+ fi
+ done
+ if [ "$FOUND_DATASET" != "yes" ]
+ then
+                            error "Unknown dataset '$DATASET'"
+ fi
+ ;;
+ esac
+ done
+ shift
;;
"--hdfs-mount" )
[[ $# -gt 0 ]] || error "$PARAM expects a further parameter"
@@ -62,8 +127,8 @@
esac
done
-WEBREQUEST_DATA_DIR_ABS="$HDFS_MOUNT_DIR_ABS/wmf/data/raw/webrequest"
-WEBREQUEST_STATISTICS_DIR_ABS="$HDFS_MOUNT_DIR_ABS/wmf/data/raw/webrequests_faulty_hosts"
+RAW_WEBREQUEST_DATA_DIR_ABS="$HDFS_MOUNT_DIR_ABS/wmf/data/raw/webrequest"
+RAW_WEBREQUEST_STATISTICS_DIR_ABS="$HDFS_MOUNT_DIR_ABS/wmf/data/raw/webrequests_faulty_hosts"
log_no_lf() {
if [ -n "$QUIET" ]
@@ -79,28 +144,88 @@
fi
}
+log_no_lf_centered() {
+ local TEXT="$1"
+ local AVAILABLE_LEN="$2"
+
+    local BLANK_HELPER="                                                            "
+
+ local TEXT_LEN="${#TEXT}"
+
+ log_no_lf "${BLANK_HELPER:0:$(( (AVAILABLE_LEN-TEXT_LEN) / 2 ))}"
+ log_no_lf "$TEXT"
+    log_no_lf "${BLANK_HELPER:0:$(( AVAILABLE_LEN - (AVAILABLE_LEN-TEXT_LEN) / 2 - TEXT_LEN ))}"
+}
+
log() {
log_no_lf "$@
"
}
hline() {
- log " +------------------+--------+--------+--------+--------+--------+"
+ local KIND="$1"
+
+ log_no_lf " +------------------+"
+ for DATASET in "${ALL_DATASETS[@]}"
+ do
+ if [ "${DATASET_VISIBILITIES["$DATASET"]}" = "yes" ]
+ then
+ local DATASET_HLINE="${DATASET_HLINES["$DATASET"]}"
+ if [ "$KIND" = "first" ]
+ then
+ DATASET_HLINE="${DATASET_HLINE//+-/--}"
+ fi
+ log_no_lf "${DATASET_HLINE}"
+ fi
+ done
+ log
}
-log_partition_status() {
+first_caption_line() {
+ local DATASET
+
+ log_no_lf " | |"
+ for DATASET in "${ALL_DATASETS[@]}"
+ do
+ if [ "${DATASET_VISIBILITIES["$DATASET"]}" = "yes" ]
+ then
+ local DATASET_CAPTION="${DATASET_CAPTIONS["$DATASET"]}"
+ local DATASET_CAPTION_LEN="${#DATASET_CAPTION}"
+ log_no_lf_centered "$DATASET" $((DATASET_CAPTION_LEN-1))
+ log_no_lf "|"
+ fi
+ done
+ log
+}
+
+second_caption_line() {
+ local DATASET
+
+ log_no_lf " | Hour |"
+ for DATASET in "${ALL_DATASETS[@]}"
+ do
+ if [ "${DATASET_VISIBILITIES["$DATASET"]}" = "yes" ]
+ then
+ log_no_lf "${DATASET_CAPTIONS["$DATASET"]}"
+ fi
+ done
+ log
+}
+
+dump_dataset_raw_webrequest_partition() {
+
local DATE_HDFS_PADDED="$1"
local SOURCE="$2"
local STATUS="X"
local DATE_HDFS_UNPADDED="${DATE_HDFS_PADDED///0//}"
-    STATISTICS_FILE_ABS="$WEBREQUEST_STATISTICS_DIR_ABS/$SOURCE/$DATE_HDFS_UNPADDED/000000_0"
+    STATISTICS_FILE_ABS="$RAW_WEBREQUEST_STATISTICS_DIR_ABS/$SOURCE/$DATE_HDFS_UNPADDED/000000_0"
if [ -e "$STATISTICS_FILE_ABS" -a ! -s "$STATISTICS_FILE_ABS" ]
then
STATUS="."
else
-        if [ -e "$WEBREQUEST_DATA_DIR_ABS/webrequest_$SOURCE/hourly/$DATE_HDFS_PADDED/_SUCCESS" ]
+        if [ -e "$RAW_WEBREQUEST_DATA_DIR_ABS/webrequest_$SOURCE/hourly/$DATE_HDFS_PADDED/_SUCCESS" ]
then
STATUS="M"
else
@@ -111,20 +236,34 @@
log_no_lf "$STATUS"
}
-hline
-log " | Date | bits | misc | mobile | text | upload |"
+dump_dataset_raw_webrequest() {
+ local DATE="$1"
+
+ local DATE_HDFS_PADDED="$(date --utc -d "$DATE" +'%Y/%m/%d/%H')"
+
+ for SOURCE in bits misc mobile text upload
+ do
+ log_no_lf " "
+ dump_dataset_raw_webrequest_partition "$DATE_HDFS_PADDED" "$SOURCE"
+ log_no_lf " |"
+ done
+}
+
+hline "first"
+first_caption_line
+second_caption_line
hline
for HOURS_OFFSET in $(seq $HOUR_OFFSET_MAX -1 $HOUR_OFFSET_MIN )
do
- DATE="$(date --utc -d "$HOURS_OFFSET hours-ago" +'%Y-%m-%dT%H/1H')"
-    DATE_HDFS_PADDED="$(date --utc -d "$HOURS_OFFSET hours ago" +'%Y/%m/%d/%H')"
- log_no_lf " | $DATE |"
- for SOURCE in bits misc mobile text upload
+ DATE="$(date --utc -d "$HOURS_OFFSET hours-ago" +'%Y-%m-%d %H')"
+ log_no_lf " | ${DATE// /T}/1H |"
+ for DATASET in "${ALL_DATASETS[@]}"
do
- log_no_lf " "
- log_partition_status "$DATE_HDFS_PADDED" "$SOURCE"
- log_no_lf " |"
+ if [ "${DATASET_VISIBILITIES["$DATASET"]}" = "yes" ]
+ then
+ dump_dataset_$DATASET "$DATE"
+ fi
done
log
done
--
To view, visit https://gerrit.wikimedia.org/r/187416
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ic998bab779398c3d1d4bc64da2c44c1daf0031c9
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits