Milimetric has submitted this change and it was merged.
Change subject: Exclude moderated data
......................................................................
Exclude moderated data
We've been seeing a big increase in active boards/topics/replies/users,
but there's also been a lot of spam lately (as also evidenced by the results
of moderation-actions.sql)
We don't currently know exactly how much of the increase was spam, making it
hard to figure out the impact of the recently deployed opt-in feature.
This will omit data from topics & posts that have been moderated.
Notice how the queries in active-boards.sql & active-topics.sql have changed
drastically. I'm unsure why the complex inner queries were needed - I may have
overlooked something...
Bug: T116797
Change-Id: I3cc6bd45ff5ec684bb7463cf762b9e78059e23a4
---
M flow/active-boards.sql
M flow/active-topics.sql
M flow/messages-posted.sql
M flow/unique-users.sql
4 files changed, 49 insertions(+), 49 deletions(-)
Approvals:
Milimetric: Verified; Looks good to me, approved
diff --git a/flow/active-boards.sql b/flow/active-boards.sql
index 89116f5..837798d 100644
--- a/flow/active-boards.sql
+++ b/flow/active-boards.sql
@@ -1,16 +1,13 @@
-SELECT DATE('{from_timestamp}') as weekstart,
- count(distinct workflow_wiki, workflow_namespace, workflow_title_text)
as num_boards
- FROM flow_workflow
- JOIN (
- SELECT a.tree_ancestor_id,
-
DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(a.tree_descendant_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S")
as timestamp
- FROM flow_tree_node a
- JOIN (
- SELECT b.tree_descendant_id, MAX(b.tree_depth) as max
- FROM flow_tree_node b
- GROUP BY b.tree_descendant_id
- ) y ON y.max = a.tree_depth AND y.tree_descendant_id =
a.tree_descendant_id
- ) z ON z.tree_ancestor_id = workflow_id
- WHERE workflow_wiki NOT IN ( 'testwiki', 'test2wiki' )
- AND timestamp >= '{from_timestamp}'
- AND timestamp < '{to_timestamp}';
+SELECT DATE('{from_timestamp}') AS weekstart,
+ COUNT(DISTINCT workflow_wiki, workflow_namespace, workflow_title_text)
AS num_boards
+FROM
+(
+ SELECT *
+ FROM flow_workflow
+ INNER JOIN flow_revision ON rev_type_id = workflow_id AND rev_type =
'post' # join needed to be able to exclude moderated topics
+ WHERE
+ workflow_wiki NOT IN ('testwiki', 'test2wiki') AND
+ workflow_type = 'topic' AND
+
DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(workflow_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S")
BETWEEN '{from_timestamp}' AND '{to_timestamp}'
+ GROUP BY rev_type, rev_type_id HAVING
SUBSTRING_INDEX(GROUP_CONCAT(rev_change_type), ',', -1) NOT IN ('hide-topic',
'delete-topic', 'suppress-topic') # exclude topics where last revision was
moderation
+) AS temp;
diff --git a/flow/active-topics.sql b/flow/active-topics.sql
index 71e8073..7f589ce 100644
--- a/flow/active-topics.sql
+++ b/flow/active-topics.sql
@@ -1,16 +1,13 @@
-SELECT DATE('{from_timestamp}') as weekstart,
- count(distinct workflow_id) as num_topics
- FROM flow_workflow
- JOIN (
- SELECT a.tree_ancestor_id,
-
DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(a.tree_descendant_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S")
as timestamp
- FROM flow_tree_node a
- JOIN (
- SELECT b.tree_descendant_id, MAX(b.tree_depth) as max
- FROM flow_tree_node b
- GROUP BY b.tree_descendant_id
- ) y ON y.max = a.tree_depth AND y.tree_descendant_id =
a.tree_descendant_id
- ) z ON z.tree_ancestor_id = workflow_id
- WHERE workflow_wiki NOT IN ( 'testwiki', 'test2wiki' )
- AND timestamp >= '{from_timestamp}'
- AND timestamp < '{to_timestamp}';
+SELECT DATE('{from_timestamp}') AS weekstart,
+ COUNT(DISTINCT workflow_id) AS num_topics
+FROM
+(
+ SELECT *
+ FROM flow_workflow
+ INNER JOIN flow_revision ON rev_type_id = workflow_id AND rev_type =
'post' # join needed to be able to exclude moderated topics
+ WHERE
+ workflow_wiki NOT IN ('testwiki', 'test2wiki') AND
+ workflow_type = 'topic' AND
+
DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(workflow_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S")
BETWEEN '{from_timestamp}' AND '{to_timestamp}'
+ GROUP BY rev_type, rev_type_id HAVING
SUBSTRING_INDEX(GROUP_CONCAT(rev_change_type), ',', -1) NOT IN ('hide-topic',
'delete-topic', 'suppress-topic') # exclude topics where last revision was
moderation
+) AS temp;
diff --git a/flow/messages-posted.sql b/flow/messages-posted.sql
index 652bde1..807c6d7 100644
--- a/flow/messages-posted.sql
+++ b/flow/messages-posted.sql
@@ -1,11 +1,14 @@
SELECT
DATE('{from_timestamp}') AS Week,
COUNT(*) AS Replies
-FROM (
- SELECT
DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(rev_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S")
AS timestamp
- FROM flow_revision
- WHERE rev_change_type = 'reply'
- AND rev_user_wiki NOT IN ('testwiki', 'test2wiki')
- HAVING timestamp >= '{from_timestamp}'
- AND timestamp < '{to_timestamp}'
-) x;
+FROM
+(
+ SELECT 1
+ FROM flow_revision AS a
+ LEFT JOIN flow_revision AS b ON a.rev_type = b.rev_type AND
a.rev_type_id = b.rev_type_id
+ WHERE
+ a.rev_user_wiki NOT IN ('testwiki', 'test2wiki') AND
+ a.rev_change_type = 'reply' AND
+
DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(a.rev_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S")
BETWEEN '{from_timestamp}' AND '{to_timestamp}'
+ GROUP BY a.rev_type, a.rev_type_id HAVING
SUBSTRING_INDEX(GROUP_CONCAT(b.rev_change_type), ',', -1) NOT IN ('hide-post',
'delete-post', 'suppress-post') # exclude topics where last revision was
moderation
+) AS temp;
diff --git a/flow/unique-users.sql b/flow/unique-users.sql
index 07df6d9..23e452b 100644
--- a/flow/unique-users.sql
+++ b/flow/unique-users.sql
@@ -2,12 +2,15 @@
DATE('{from_timestamp}') AS Week,
COUNT(DISTINCT rev_user_id, COALESCE(rev_user_ip, ''), rev_user_wiki)
AS "Unique users"
FROM (
- SELECT
DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(rev_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S")
AS timestamp,
- rev_user_id,
- rev_user_ip,
- rev_user_wiki
- FROM flow_revision
- WHERE rev_user_wiki NOT IN ('testwiki', 'test2wiki')
- HAVING timestamp >= '{from_timestamp}'
- AND timestamp < '{to_timestamp}'
-) x;
+ SELECT
+ a.rev_user_id,
+ a.rev_user_ip,
+ a.rev_user_wiki
+ FROM flow_revision AS a
+ LEFT JOIN flow_revision AS b ON a.rev_type = b.rev_type AND
a.rev_type_id = b.rev_type_id
+ WHERE
+ a.rev_user_wiki NOT IN ('testwiki', 'test2wiki') AND
+ a.rev_change_type = 'reply' AND
+
DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(a.rev_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S")
BETWEEN '{from_timestamp}' AND '{to_timestamp}'
+ GROUP BY a.rev_type, a.rev_type_id HAVING
SUBSTRING_INDEX(GROUP_CONCAT(b.rev_change_type), ',', -1) NOT IN ('hide-post',
'delete-post', 'suppress-post') # exclude topics where last revision was
moderation
+) AS temp;
--
To view, visit https://gerrit.wikimedia.org/r/249731
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I3cc6bd45ff5ec684bb7463cf762b9e78059e23a4
Gerrit-PatchSet: 2
Gerrit-Project: analytics/limn-flow-data
Gerrit-Branch: master
Gerrit-Owner: Matthias Mullie <[email protected]>
Gerrit-Reviewer: Matthias Mullie <[email protected]>
Gerrit-Reviewer: Milimetric <[email protected]>
Gerrit-Reviewer: Siebrand <[email protected]>
Gerrit-Reviewer: Springle <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits