Milimetric has submitted this change and it was merged.
Change subject: Strip project when uploading cohort
......................................................................
Strip project when uploading cohort
When a cohort is uploaded with blank space between the comma
that separates the username and the project, and the project
itself; centralauth service is not capable of unifying i.e.
'enwiki' and ' enwiki' and this causes a validation error.
This fix adresses that problem and adds a test for the
specific case.
Bug: T78584
Change-Id: I301b8c2cafbf4de98b2fd2c2ead9cd74b254f32f
---
M tests/test_api/test_centralauth_service.py
M tests/test_controllers/test_cohort_upload.py
M tests/test_controllers/test_cohorts.py
M wikimetrics/api/centralauth.py
M wikimetrics/forms/cohort_upload.py
5 files changed, 106 insertions(+), 92 deletions(-)
Approvals:
Milimetric: Looks good to me, approved
jenkins-bot: Verified
diff --git a/tests/test_api/test_centralauth_service.py
b/tests/test_api/test_centralauth_service.py
index 7f85cf5..637ed42 100644
--- a/tests/test_api/test_centralauth_service.py
+++ b/tests/test_api/test_centralauth_service.py
@@ -9,7 +9,6 @@
def setUp(self):
DatabaseTest.setUp(self)
self.expand = CentralAuthService().expand_via_centralauth
- self.default_project = 'defaultproject'
def test_expand_via_centralauth(self):
username_1, username_2 = 'User 1', 'User 2'
@@ -24,30 +23,30 @@
self.caSession.commit()
records = self.expand(
- [[username_1, wiki_1]],
- self.caSession,
- self.default_project
+ [{'username': username_1, 'project': wiki_1}],
+ self.caSession
)
assert_equal([
- [username_1, wiki_1],
- [username_1, wiki_2],
- [username_1, wiki_3],
+ {'username': username_1, 'project': wiki_1},
+ {'username': username_1, 'project': wiki_2},
+ {'username': username_1, 'project': wiki_3},
], records)
records = self.expand(
- [[username_2, wiki_1]],
- self.caSession,
- self.default_project
+ [{'username': username_2, 'project': wiki_1}],
+ self.caSession
)
assert_equal([
- [username_2, wiki_1],
- [username_2, wiki_2],
+ {'username': username_2, 'project': wiki_1},
+ {'username': username_2, 'project': wiki_2},
], records)
records = self.expand(
- [[username_1, wiki_1], [username_2, wiki_1]],
- self.caSession,
- self.default_project
+ [
+ {'username': username_1, 'project': wiki_1},
+ {'username': username_2, 'project': wiki_1},
+ ],
+ self.caSession
)
assert_equal(len(records), 5)
@@ -59,12 +58,11 @@
'''
username, wiki = 'Non-existent', 'notimportant'
records = self.expand(
- [[username, wiki]],
- self.caSession,
- self.default_project
+ [{'username': username, 'project': wiki}],
+ self.caSession
)
assert_equal(len(records), 1)
- assert_equal(records[0][0], username)
+ assert_equal(records[0]['username'], username)
def test_expand_utf8_user(self):
'''
@@ -79,13 +77,12 @@
])
self.caSession.commit()
records = self.expand(
- [[username, wiki_1]],
- self.caSession,
- self.default_project
+ [{'username': username, 'project': wiki_1}],
+ self.caSession
)
assert_equal([
- [username, wiki_1],
- [username, wiki_2],
+ {'username': username, 'project': wiki_1},
+ {'username': username, 'project': wiki_2},
], records)
def test_expand_user_without_duplicates(self):
@@ -101,11 +98,13 @@
])
self.caSession.commit()
records = self.expand(
- [[username, wiki_1], [username, wiki_1]],
- self.caSession,
- self.default_project
+ [
+ {'username': username, 'project': wiki_1},
+ {'username': username, 'project': wiki_1},
+ ],
+ self.caSession
)
assert_equal([
- [username, wiki_1],
- [username, wiki_2],
+ {'username': username, 'project': wiki_1},
+ {'username': username, 'project': wiki_2},
], records)
diff --git a/tests/test_controllers/test_cohort_upload.py
b/tests/test_controllers/test_cohort_upload.py
index f8ac4be..7cf69eb 100644
--- a/tests/test_controllers/test_cohort_upload.py
+++ b/tests/test_controllers/test_cohort_upload.py
@@ -50,26 +50,26 @@
def test_parse_textarea_usernames(self):
unparsed = 'dan,en\rv\n,\r\nsomething with spaces'
parsed = parse_textarea_usernames(unparsed)
- assert_equal(parsed.next(), ['dan', 'en'])
- assert_equal(parsed.next(), ['v'])
- assert_equal(parsed.next(), ['', ''])
- assert_equal(parsed.next(), ['something with spaces'])
- assert_raises(StopIteration, parsed.next)
+ assert_equal(parsed[0], 'dan,en')
+ assert_equal(parsed[1], 'v')
+ assert_equal(parsed[2], ',')
+ assert_equal(parsed[3], 'something with spaces')
+ assert_equal(len(parsed), 4)
# needs to deal with unicode types as that is what this
# method will get from flask
unparsed = u'تيسير سامى سلامة,en\rv\n,\r\nsomething with spaces'
parsed = parse_textarea_usernames(unparsed)
- username = parsed.next()[0]
+ username = parsed[0]
# username will be just plain bytes, convert to unicode
# to be able to compare
- assert_equal(username.decode("utf-8"), u'تيسير سامى سلامة')
+ assert_equal(username.decode("utf-8"), u'تيسير سامى سلامة,en')
def test_format_records_with_project(self):
parsed = format_records(
[
- ['dan', 'wiki'],
- ['v', 'wiki'],
- [',', 'wiki']
+ 'dan,wiki',
+ 'v,wiki',
+ ',,wiki',
],
None
)
@@ -81,10 +81,7 @@
def test_format_records_without_project(self):
parsed = format_records(
- [
- ['dan'],
- ['v']
- ],
+ ['dan', 'v'],
'wiki'
)
assert_equal(len(parsed), 2)
@@ -95,9 +92,7 @@
def test_format_records_with_shorthand_project(self):
parsed = format_records(
- [
- ['dan', 'en']
- ],
+ ['dan,en'],
None
)
assert_equal(len(parsed), 1)
@@ -111,11 +106,18 @@
'''
username = u'Kán'.encode("utf-8")
parsed = format_records(
- [
- [username, 'en']
- ],
+ [username + ',en'],
None
)
assert_equal(len(parsed), 1)
assert_equal(parsed[0]['username'], u'Kán'.encode("utf-8"))
assert_equal(parsed[0]['project'], 'en')
+
+ def test_format_records_with_spaces_in_project(self):
+ parsed = format_records(
+ ['dan, en'],
+ None
+ )
+ assert_equal(len(parsed), 1)
+ assert_equal(parsed[0]['username'], 'Dan')
+ assert_equal(parsed[0]['project'], 'en')
diff --git a/tests/test_controllers/test_cohorts.py
b/tests/test_controllers/test_cohorts.py
index 18f2c2d..eda84d1 100644
--- a/tests/test_controllers/test_cohorts.py
+++ b/tests/test_controllers/test_cohorts.py
@@ -516,12 +516,12 @@
assert_true(response.data.find('Please fix validation problems') >= 0)
def test_upload_with_centralauth_expansion_works(self):
- data = 'actual validation tested elsewhere'
+ data = 'Actual validation tested elsewhere'
centralauth_called = [False]
- def expand_via_ca_mock(cohort_users, session, default_project):
+ def expand_via_ca_mock(cohort_users, session):
+ assert_equal(cohort_users[0]['username'], data)
centralauth_called[0] = True
- assert_true([data] in cohort_users)
return []
ca_service_mock = CentralAuthService()
@@ -539,7 +539,7 @@
assert_true(response.data.find('href="/cohorts/#') >= 0)
def test_normal_upload_does_not_call_centralauth(self):
- data = 'actual validation tested elsewhere'
+ data = 'Actual validation tested elsewhere'
centralauth_called = [False]
def expand_via_ca_mock(cohort_users, session):
diff --git a/wikimetrics/api/centralauth.py b/wikimetrics/api/centralauth.py
index 50e84d3..1a12841 100644
--- a/wikimetrics/api/centralauth.py
+++ b/wikimetrics/api/centralauth.py
@@ -5,30 +5,51 @@
class CentralAuthService(object):
- def expand_via_centralauth(self, cohort_users, ca_session,
default_project):
+ def expand_via_centralauth(self, cohort_users, ca_session):
'''
- Returns all centralauth local users (name, project)
- that share the name with the passed cohort users.
- If a cohort user is not in the centralauth localuser table
+ Tries to find users with the original user's names in other projects.
+ And adds them to the cohort. So the cohort will include all projects
+ of the given users. i.e. given the cohort:
+
+ [{'username': 'JSmith (WMF)', 'project': 'enwiki'}]
+
+ and considering the given user also has an account in 'dewiki' and
+ 'frwiktionary', it will return:
+
+ [{'username': 'JSmith (WMF)', 'project': 'enwiki'},
+ {'username': 'JSmith (WMF)', 'project': 'dewiki'},
+ {'username': 'JSmith (WMF)', 'project': 'frwiktionary'}]
+
+ The method uses the centralauth database for that.
+ If a cohort user is not in the centralauth localuser table,
it will be returned as is.
+
+ Parameters
+ cohot_users : List of cohort users, formatted like
+ {'username': username, 'project': project}
+ ca_session : Centralauth database session
+
+ Returns
+ List of expanded cohort users, formatted like
+ {'username': username, 'project': project}.
'''
expanded_cohort = set([])
for cohort_user in cohort_users:
- username = parse_username(cohort_user[0])
- if len(cohort_user) > 1:
- project = cohort_user[-1]
- else:
- project = default_project
- expanded_cohort.add((username, project))
+ user_tuple = (cohort_user['username'], cohort_user['project'])
+ expanded_cohort.add(user_tuple)
ca_local_users = (
ca_session
.query(LocalUser.lu_name, LocalUser.lu_wiki)
- .filter(LocalUser.lu_name == username).all()
+ .filter(LocalUser.lu_name == cohort_user['username']).all()
)
if len(ca_local_users) > 0:
ca_local_users = set([(x[0], x[1]) for x in ca_local_users])
expanded_cohort = expanded_cohort.union(ca_local_users)
- expanded_cohort = [list(x) for x in expanded_cohort]
- return sorted(expanded_cohort)
+ expanded_cohort = [{
+ 'username' : x[0],
+ 'project' : x[1]
+ } for x in expanded_cohort]
+ expanded_cohort.sort(key=lambda x: [x['username'], x['project']])
+ return expanded_cohort
diff --git a/wikimetrics/forms/cohort_upload.py
b/wikimetrics/forms/cohort_upload.py
index 1a4e1bb..12502aa 100644
--- a/wikimetrics/forms/cohort_upload.py
+++ b/wikimetrics/forms/cohort_upload.py
@@ -1,5 +1,4 @@
import csv
-import requests
from flask import g
from wikimetrics.configurables import db
from wtforms import StringField, FileField, TextAreaField, RadioField
@@ -55,39 +54,37 @@
if self.csv.data:
# uploaded a cohort file
- csv_file = normalize_newlines(self.csv.data.stream)
- unparsed = csv.reader(csv_file)
+ csv_lines = normalize_newlines(self.csv.data.stream)
else:
- #TODO: Check valid input
# used upload box, note that wtforms returns unicode types here
- unparsed = parse_textarea_usernames(self.paste_username.data)
+ csv_lines = parse_textarea_usernames(self.paste_username.data)
+
+ records = format_records(csv_lines, self.project.data)
if self.centralauth.data is True:
ca_session = db.get_ca_session()
- unparsed = g.centralauth_service.expand_via_centralauth(
- unparsed, ca_session, self.project.data)
+ records = g.centralauth_service.expand_via_centralauth(records,
ca_session)
- self.records = format_records(unparsed, self.project.data)
+ self.records = records
-def format_records(unparsed, default_project):
+def format_records(csv_lines, default_project):
"""
- Process and formats records read from a csv file or coming from the upload
box
+ Processes and formats lines read from a csv file or coming from the upload
box.
+ i.e. "dan,en" becomes {'username': 'dan', 'project': 'en'}.
Note this method assumes bytes (str) not unicode types as input
Parameters
- unparsed : records in array form
+ csv_lines : collection of strings, each with csv format
default_project : the default project to attribute to records without
one
Returns
a list of the formatted records in which each element is of this form:
{'username':'parsed username', 'project':'as specified or default'}
-
-
"""
records = []
- for r in unparsed:
+ for r in csv.reader(csv_lines):
if r is not None and len(r) > 0:
# NOTE: the reason for the crazy -1 and comma joins
# is that some users can have commas in their name
@@ -96,7 +93,7 @@
# and the username to be the last or maybe change to a tsv format
if len(r) > 1:
username = ",".join([str(p) for p in r[:-1]])
- project = r[-1] or default_project
+ project = r[-1].strip() or default_project
else:
username = r[0]
project = default_project
@@ -120,24 +117,19 @@
yield line
-def parse_textarea_usernames(paste_username):
+def parse_textarea_usernames(textarea_contents):
"""
- Takes text and parses it into a list of lists of usernames
- and their wiki. i.e. "dan,en v" becomes [['dan','en'],['v']]. Whitespace is
- the delimiter of each list. Prepares text to go through parse_records().
+ Prepares text to go through format_records().
- Note that paste_username is going to be a unicode type as flask builds it
+ Note that textarea_contents is going to be a unicode type as flask builds
it
that way. Output is plain bytes (str type) as the rest of our code
does not assume unicode types.
Parameters
- paste_username : lines representing a username and (optionally)
- a project separated by comas
+ textarea_contents : the text pasted into the textarea.
Returns
- list with username at index 0 and project at index 1
+ list of strings, each holding a line of the input.
Unicode types are transformed to bytes.
"""
- for username in paste_username.splitlines():
- _username = username.encode("utf-8")
- yield _username.strip().split(',')
+ return textarea_contents.encode('utf-8').splitlines()
--
To view, visit https://gerrit.wikimedia.org/r/180071
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I301b8c2cafbf4de98b2fd2c2ead9cd74b254f32f
Gerrit-PatchSet: 5
Gerrit-Project: analytics/wikimetrics
Gerrit-Branch: master
Gerrit-Owner: Mforns <[email protected]>
Gerrit-Reviewer: Mforns <[email protected]>
Gerrit-Reviewer: Milimetric <[email protected]>
Gerrit-Reviewer: Nuria <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits