Milimetric has submitted this change and it was merged.

Change subject: Strip project when uploading cohort
......................................................................


Strip project when uploading cohort

When a cohort is uploaded with blank space between the comma
that separates the username and the project, and the project
itself; centralauth service is not capable of unifying i.e.
'enwiki' and ' enwiki' and this causes a validation error.

This fix adresses that problem and adds a test for the
specific case.

Bug: T78584
Change-Id: I301b8c2cafbf4de98b2fd2c2ead9cd74b254f32f
---
M tests/test_api/test_centralauth_service.py
M tests/test_controllers/test_cohort_upload.py
M tests/test_controllers/test_cohorts.py
M wikimetrics/api/centralauth.py
M wikimetrics/forms/cohort_upload.py
5 files changed, 106 insertions(+), 92 deletions(-)

Approvals:
  Milimetric: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/tests/test_api/test_centralauth_service.py 
b/tests/test_api/test_centralauth_service.py
index 7f85cf5..637ed42 100644
--- a/tests/test_api/test_centralauth_service.py
+++ b/tests/test_api/test_centralauth_service.py
@@ -9,7 +9,6 @@
     def setUp(self):
         DatabaseTest.setUp(self)
         self.expand = CentralAuthService().expand_via_centralauth
-        self.default_project = 'defaultproject'
 
     def test_expand_via_centralauth(self):
         username_1, username_2 = 'User 1', 'User 2'
@@ -24,30 +23,30 @@
         self.caSession.commit()
 
         records = self.expand(
-            [[username_1, wiki_1]],
-            self.caSession,
-            self.default_project
+            [{'username': username_1, 'project': wiki_1}],
+            self.caSession
         )
         assert_equal([
-            [username_1, wiki_1],
-            [username_1, wiki_2],
-            [username_1, wiki_3],
+            {'username': username_1, 'project': wiki_1},
+            {'username': username_1, 'project': wiki_2},
+            {'username': username_1, 'project': wiki_3},
         ], records)
 
         records = self.expand(
-            [[username_2, wiki_1]],
-            self.caSession,
-            self.default_project
+            [{'username': username_2, 'project': wiki_1}],
+            self.caSession
         )
         assert_equal([
-            [username_2, wiki_1],
-            [username_2, wiki_2],
+            {'username': username_2, 'project': wiki_1},
+            {'username': username_2, 'project': wiki_2},
         ], records)
 
         records = self.expand(
-            [[username_1, wiki_1], [username_2, wiki_1]],
-            self.caSession,
-            self.default_project
+            [
+                {'username': username_1, 'project': wiki_1},
+                {'username': username_2, 'project': wiki_1},
+            ],
+            self.caSession
         )
         assert_equal(len(records), 5)
 
@@ -59,12 +58,11 @@
         '''
         username, wiki = 'Non-existent', 'notimportant'
         records = self.expand(
-            [[username, wiki]],
-            self.caSession,
-            self.default_project
+            [{'username': username, 'project': wiki}],
+            self.caSession
         )
         assert_equal(len(records), 1)
-        assert_equal(records[0][0], username)
+        assert_equal(records[0]['username'], username)
 
     def test_expand_utf8_user(self):
         '''
@@ -79,13 +77,12 @@
         ])
         self.caSession.commit()
         records = self.expand(
-            [[username, wiki_1]],
-            self.caSession,
-            self.default_project
+            [{'username': username, 'project': wiki_1}],
+            self.caSession
         )
         assert_equal([
-            [username, wiki_1],
-            [username, wiki_2],
+            {'username': username, 'project': wiki_1},
+            {'username': username, 'project': wiki_2},
         ], records)
 
     def test_expand_user_without_duplicates(self):
@@ -101,11 +98,13 @@
         ])
         self.caSession.commit()
         records = self.expand(
-            [[username, wiki_1], [username, wiki_1]],
-            self.caSession,
-            self.default_project
+            [
+                {'username': username, 'project': wiki_1},
+                {'username': username, 'project': wiki_1},
+            ],
+            self.caSession
         )
         assert_equal([
-            [username, wiki_1],
-            [username, wiki_2],
+            {'username': username, 'project': wiki_1},
+            {'username': username, 'project': wiki_2},
         ], records)
diff --git a/tests/test_controllers/test_cohort_upload.py 
b/tests/test_controllers/test_cohort_upload.py
index f8ac4be..7cf69eb 100644
--- a/tests/test_controllers/test_cohort_upload.py
+++ b/tests/test_controllers/test_cohort_upload.py
@@ -50,26 +50,26 @@
     def test_parse_textarea_usernames(self):
         unparsed = 'dan,en\rv\n,\r\nsomething with spaces'
         parsed = parse_textarea_usernames(unparsed)
-        assert_equal(parsed.next(), ['dan', 'en'])
-        assert_equal(parsed.next(), ['v'])
-        assert_equal(parsed.next(), ['', ''])
-        assert_equal(parsed.next(), ['something with spaces'])
-        assert_raises(StopIteration, parsed.next)
+        assert_equal(parsed[0], 'dan,en')
+        assert_equal(parsed[1], 'v')
+        assert_equal(parsed[2], ',')
+        assert_equal(parsed[3], 'something with spaces')
+        assert_equal(len(parsed), 4)
         # needs to deal with unicode types as that is what this
         # method will get from flask
         unparsed = u'تيسير سامى سلامة,en\rv\n,\r\nsomething with spaces'
         parsed = parse_textarea_usernames(unparsed)
-        username = parsed.next()[0]
+        username = parsed[0]
         # username will be just plain bytes, convert to unicode
         # to be able to compare
-        assert_equal(username.decode("utf-8"), u'تيسير سامى سلامة')
+        assert_equal(username.decode("utf-8"), u'تيسير سامى سلامة,en')
 
     def test_format_records_with_project(self):
         parsed = format_records(
             [
-                ['dan', 'wiki'],
-                ['v', 'wiki'],
-                [',', 'wiki']
+                'dan,wiki',
+                'v,wiki',
+                ',,wiki',
             ],
             None
         )
@@ -81,10 +81,7 @@
 
     def test_format_records_without_project(self):
         parsed = format_records(
-            [
-                ['dan'],
-                ['v']
-            ],
+            ['dan', 'v'],
             'wiki'
         )
         assert_equal(len(parsed), 2)
@@ -95,9 +92,7 @@
 
     def test_format_records_with_shorthand_project(self):
         parsed = format_records(
-            [
-                ['dan', 'en']
-            ],
+            ['dan,en'],
             None
         )
         assert_equal(len(parsed), 1)
@@ -111,11 +106,18 @@
         '''
         username = u'Kán'.encode("utf-8")
         parsed = format_records(
-            [
-                [username, 'en']
-            ],
+            [username + ',en'],
             None
         )
         assert_equal(len(parsed), 1)
         assert_equal(parsed[0]['username'], u'Kán'.encode("utf-8"))
         assert_equal(parsed[0]['project'], 'en')
+
+    def test_format_records_with_spaces_in_project(self):
+        parsed = format_records(
+            ['dan, en'],
+            None
+        )
+        assert_equal(len(parsed), 1)
+        assert_equal(parsed[0]['username'], 'Dan')
+        assert_equal(parsed[0]['project'], 'en')
diff --git a/tests/test_controllers/test_cohorts.py 
b/tests/test_controllers/test_cohorts.py
index 18f2c2d..eda84d1 100644
--- a/tests/test_controllers/test_cohorts.py
+++ b/tests/test_controllers/test_cohorts.py
@@ -516,12 +516,12 @@
         assert_true(response.data.find('Please fix validation problems') >= 0)
 
     def test_upload_with_centralauth_expansion_works(self):
-        data = 'actual validation tested elsewhere'
+        data = 'Actual validation tested elsewhere'
         centralauth_called = [False]
 
-        def expand_via_ca_mock(cohort_users, session, default_project):
+        def expand_via_ca_mock(cohort_users, session):
+            assert_equal(cohort_users[0]['username'], data)
             centralauth_called[0] = True
-            assert_true([data] in cohort_users)
             return []
 
         ca_service_mock = CentralAuthService()
@@ -539,7 +539,7 @@
         assert_true(response.data.find('href="/cohorts/#') >= 0)
 
     def test_normal_upload_does_not_call_centralauth(self):
-        data = 'actual validation tested elsewhere'
+        data = 'Actual validation tested elsewhere'
         centralauth_called = [False]
 
         def expand_via_ca_mock(cohort_users, session):
diff --git a/wikimetrics/api/centralauth.py b/wikimetrics/api/centralauth.py
index 50e84d3..1a12841 100644
--- a/wikimetrics/api/centralauth.py
+++ b/wikimetrics/api/centralauth.py
@@ -5,30 +5,51 @@
 
 
 class CentralAuthService(object):
-    def expand_via_centralauth(self, cohort_users, ca_session, 
default_project):
+    def expand_via_centralauth(self, cohort_users, ca_session):
         '''
-        Returns all centralauth local users (name, project)
-        that share the name with the passed cohort users.
-        If a cohort user is not in the centralauth localuser table
+        Tries to find users with the original user's names in other projects.
+        And adds them to the cohort. So the cohort will include all projects
+        of the given users. i.e. given the cohort:
+
+        [{'username': 'JSmith (WMF)', 'project': 'enwiki'}]
+
+        and considering the given user also has an account in 'dewiki' and
+        'frwiktionary', it will return:
+
+        [{'username': 'JSmith (WMF)', 'project': 'enwiki'},
+         {'username': 'JSmith (WMF)', 'project': 'dewiki'},
+         {'username': 'JSmith (WMF)', 'project': 'frwiktionary'}]
+
+        The method uses the centralauth database for that.
+        If a cohort user is not in the centralauth localuser table,
         it will be returned as is.
+
+        Parameters
+            cohot_users : List of cohort users, formatted like
+                          {'username': username, 'project': project}
+            ca_session  : Centralauth database session
+
+        Returns
+            List of expanded cohort users, formatted like
+            {'username': username, 'project': project}.
         '''
         expanded_cohort = set([])
         for cohort_user in cohort_users:
-            username = parse_username(cohort_user[0])
-            if len(cohort_user) > 1:
-                project = cohort_user[-1]
-            else:
-                project = default_project
-            expanded_cohort.add((username, project))
+            user_tuple = (cohort_user['username'], cohort_user['project'])
+            expanded_cohort.add(user_tuple)
 
             ca_local_users = (
                 ca_session
                 .query(LocalUser.lu_name, LocalUser.lu_wiki)
-                .filter(LocalUser.lu_name == username).all()
+                .filter(LocalUser.lu_name == cohort_user['username']).all()
             )
             if len(ca_local_users) > 0:
                 ca_local_users = set([(x[0], x[1]) for x in ca_local_users])
                 expanded_cohort = expanded_cohort.union(ca_local_users)
 
-        expanded_cohort = [list(x) for x in expanded_cohort]
-        return sorted(expanded_cohort)
+        expanded_cohort = [{
+            'username' : x[0],
+            'project'  : x[1]
+        } for x in expanded_cohort]
+        expanded_cohort.sort(key=lambda x: [x['username'], x['project']])
+        return expanded_cohort
diff --git a/wikimetrics/forms/cohort_upload.py 
b/wikimetrics/forms/cohort_upload.py
index 1a4e1bb..12502aa 100644
--- a/wikimetrics/forms/cohort_upload.py
+++ b/wikimetrics/forms/cohort_upload.py
@@ -1,5 +1,4 @@
 import csv
-import requests
 from flask import g
 from wikimetrics.configurables import db
 from wtforms import StringField, FileField, TextAreaField, RadioField
@@ -55,39 +54,37 @@
 
         if self.csv.data:
             # uploaded a cohort file
-            csv_file = normalize_newlines(self.csv.data.stream)
-            unparsed = csv.reader(csv_file)
+            csv_lines = normalize_newlines(self.csv.data.stream)
 
         else:
-            #TODO: Check valid input
             # used upload box, note that wtforms returns unicode types here
-            unparsed = parse_textarea_usernames(self.paste_username.data)
+            csv_lines = parse_textarea_usernames(self.paste_username.data)
+
+        records = format_records(csv_lines, self.project.data)
 
         if self.centralauth.data is True:
             ca_session = db.get_ca_session()
-            unparsed = g.centralauth_service.expand_via_centralauth(
-                unparsed, ca_session, self.project.data)
+            records = g.centralauth_service.expand_via_centralauth(records, 
ca_session)
 
-        self.records = format_records(unparsed, self.project.data)
+        self.records = records
 
 
-def format_records(unparsed, default_project):
+def format_records(csv_lines, default_project):
     """
-    Process and formats records read from a csv file or coming from the upload 
box
+    Processes and formats lines read from a csv file or coming from the upload 
box.
+    i.e. "dan,en" becomes {'username': 'dan', 'project': 'en'}.
     Note this method assumes bytes (str) not unicode types as input
 
     Parameters
-        unparsed        : records in array form
+        csv_lines       : collection of strings, each with csv format
         default_project : the default project to attribute to records without 
one
 
     Returns
         a list of the formatted records in which each element is of this form:
         {'username':'parsed username', 'project':'as specified or default'}
-
-
     """
     records = []
-    for r in unparsed:
+    for r in csv.reader(csv_lines):
         if r is not None and len(r) > 0:
             # NOTE: the reason for the crazy -1 and comma joins
             # is that some users can have commas in their name
@@ -96,7 +93,7 @@
             # and the username to be the last or maybe change to a tsv format
             if len(r) > 1:
                 username = ",".join([str(p) for p in r[:-1]])
-                project = r[-1] or default_project
+                project = r[-1].strip() or default_project
             else:
                 username = r[0]
                 project = default_project
@@ -120,24 +117,19 @@
             yield line
 
 
-def parse_textarea_usernames(paste_username):
+def parse_textarea_usernames(textarea_contents):
     """
-    Takes text and parses it into a list of lists of usernames
-    and their wiki. i.e. "dan,en v" becomes [['dan','en'],['v']]. Whitespace is
-    the delimiter of each list. Prepares text to go through parse_records().
+    Prepares text to go through format_records().
 
-    Note that paste_username is going to be a unicode type as flask builds it
+    Note that textarea_contents is going to be a unicode type as flask builds 
it
     that way. Output is plain bytes (str type) as the rest of our code
     does not assume unicode types.
 
     Parameters
-        paste_username : lines representing a username and (optionally)
-        a project separated by comas
+        textarea_contents : the text pasted into the textarea.
 
     Returns
-        list with username at index 0 and project at index 1
+        list of strings, each holding a line of the input.
         Unicode types are transformed to bytes.
     """
-    for username in paste_username.splitlines():
-        _username = username.encode("utf-8")
-        yield _username.strip().split(',')
+    return textarea_contents.encode('utf-8').splitlines()

-- 
To view, visit https://gerrit.wikimedia.org/r/180071
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I301b8c2cafbf4de98b2fd2c2ead9cd74b254f32f
Gerrit-PatchSet: 5
Gerrit-Project: analytics/wikimetrics
Gerrit-Branch: master
Gerrit-Owner: Mforns <[email protected]>
Gerrit-Reviewer: Mforns <[email protected]>
Gerrit-Reviewer: Milimetric <[email protected]>
Gerrit-Reviewer: Nuria <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to