Bmansurov has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/403938 )

Change subject: WIP: Add support for category-based section recommendations
......................................................................

WIP: Add support for category-based section recommendations

To get relevant sections for the Zoology_book category, ping
/section/category/Zoology_book. The result will include a list of
section names and their relevance scores, e.g.:

[["Reception",0.30434782608695654],
 ["Contents",0.17391304347826086],
 ["Overview",0.17391304347826086],
 ...]

The patch assumes that models for various languages exist in a database.
The original model data is in JSON, which can be converted to SQL using
scripts/cat-recs-json-to-sql.py. The table format is described in
scripts/cat-recs-table.sql.

Bug: T183043
Change-Id: I748f174842f91bdd4f401d123a47be344eef920c
---
M config.dev.yaml
M package.json
A routes/section.js
A scripts/cat-recs-json-to-sql.py
A scripts/cat-recs-table.sql
5 files changed, 183 insertions(+), 2 deletions(-)


  git pull 
ssh://gerrit.wikimedia.org:29418/mediawiki/services/recommendation-api 
refs/changes/38/403938/1

diff --git a/config.dev.yaml b/config.dev.yaml
index aa01eec..d59d933 100644
--- a/config.dev.yaml
+++ b/config.dev.yaml
@@ -115,3 +115,16 @@
             gsrlimit: 1
             gsrsearch: '{{params.seed}}'
             gsrprop: ''
+      mysql:
+        connection_limit: 10
+        host: 'localhost'
+        user: 'user'
+        password : 'password'
+        database : 'database'
+        tables:
+          # {{lang}} comes from section.category_models
+          category_to_sections: '{{lang}}_category_to_sections'
+      section:
+        category_models:
+          - 'en'
+          - 'fr'
\ No newline at end of file
diff --git a/package.json b/package.json
index 25d40c7..9625ad9 100644
--- a/package.json
+++ b/package.json
@@ -37,12 +37,13 @@
     "compression": "^1.7.1",
     "domino": "^1.0.30",
     "express": "^4.16.2",
+    "http-shutdown": "^1.2.0",
     "js-yaml": "^3.10.0",
+    "mysql": "^2.15.0",
     "preq": "^0.5.3",
     "service-runner": "^2.4.2",
     "swagger-router": "^0.7.1",
-    "swagger-ui": "git+https://github.com/wikimedia/swagger-ui#master",
-    "http-shutdown": "^1.2.0"
+    "swagger-ui": "git+https://github.com/wikimedia/swagger-ui#master"
   },
   "devDependencies": {
     "ajv": "^5.5.0",
diff --git a/routes/section.js b/routes/section.js
new file mode 100644
index 0000000..dd54460
--- /dev/null
+++ b/routes/section.js
@@ -0,0 +1,88 @@
+'use strict';
+
+const mysql = require('mysql');
+const Template = require('swagger-router').Template;
+
+const sUtil = require('../lib/util');
+
+
+/**
+ * The main router object
+ */
+const router = sUtil.router();
+
+/**
+ * The main application object reported when this module is require()d
+ */
+let app;
+
+/**
+ * Call callback with sections matching category in language
+ * @param {string} language model language
+ * @param {string} seed category name
+ * @param {Function<Object, Object[]>} callback that accepts error and results
+ */
+function recommendFromCategory(language, seed, callback) {
+    const tableTpl = new Template(app.conf.mysql.tables.category_to_sections);
+    const table = tableTpl.expand({
+        lang: language
+    });
+    app.mysqlPool.query(
+        `SELECT sections FROM ?? WHERE category=?`,
+        [table, seed],
+        (error, results, _) => {
+            if (error) {
+                app.logger.log('error/db', error);
+            }
+            callback(
+                error,
+                results && results.length ? JSON.parse(results[0].sections) : []
+            );
+        });
+}
+
+/**
+ * GET /category/{seed}
+ * Gets sections based on category and language
+ */
+router.get('/category/:seed', (req, res) => {
+    const language = req.params.domain.split('.')[0];  // e.g. en
+    if (!app.conf.section.category_models.includes(language)) {
+        app.logger.log('error/section',
+            `Model for "${language}" doesn't exist.`);
+        throw new sUtil.HTTPError({
+            status: 400
+        });
+    }
+    recommendFromCategory(language, req.params.seed, (error, result) => {
+        if (error) {
+            const errorObject = new sUtil.HTTPError({ status: 400 });
+            res.status(errorObject.status).send(errorObject);
+        } else {
+            res.json(result);
+        }
+    });
+});
+
+
+module.exports = function(appObj) {
+
+    app = appObj;
+
+    const mysqlConf = app.conf.mysql;
+    app.mysqlPool = mysql.createPool({
+        connectionLimit: mysqlConf.connection_limit,
+        host: mysqlConf.host,
+        user: mysqlConf.user,
+        password: mysqlConf.password,
+        database: mysqlConf.database
+    });
+
+    return {
+        path: '/section',
+        api_version: 1,
+        router
+    };
+
+};
+
diff --git a/scripts/cat-recs-json-to-sql.py b/scripts/cat-recs-json-to-sql.py
new file mode 100755
index 0000000..e1a203c
--- /dev/null
+++ b/scripts/cat-recs-json-to-sql.py
@@ -0,0 +1,57 @@
+#!/usr/bin/python3
+
+# Data is currently available at:
+# https://meta.wikimedia.org/wiki/Research:Expanding_Wikipedia_articles_across_languages/Data
+
+import json
+
+in_file = "en_category_recommendations.json"
+out_file = "en_category_recommendations.sql"
+table_name = "en_category_to_sections"
+
+
+def convert_json_to_sql(in_file, out_file, table_name):
+    """Convert recommendations JSON file into an SQL file ready to be
+    imported into a database. The table format is described in
+    cat-recs-table.sql.
+
+    The original file format looks like so:
+        {
+          "category": "Canadian_Pacific_Railway_executives",
+          "recs": [
+            {
+              "relevance": 0.375,
+              "title": "Early life"
+            },
+            {
+              "relevance": 0.25,
+              "title": "Philanthropy"
+            },
+            ...
+          ]
+        }
+
+    The SQL format looks like so (where sections are ordered by
+    relevance first and then lexicographically):
+        INSERT INTO en_category_to_sections(category, sections)
+         VALUES('Canadian_Pacific_Railway_executives',
+                '[["Early life", 0.375], ["Biography", 0.25], ...]');
+    """
+    query = "INSERT INTO %s(category, sections) VALUES('%s', \'%s\');\n"
+    with open(in_file, "r") as inf:
+        with open(out_file, "w") as outf:
+            for line in inf:
+                data = json.loads(line)
+                recs = [(x['title'].replace("'", "''"), x['relevance'])
+                        for x in data['recs']]
+                recs.sort(key=lambda x: x[0])
+                recs.sort(key=lambda x: x[1], reverse=True)
+                outf.write(
+                    query
+                    % (table_name,
+                       data['category'].replace("'", "''"),
+                       json.dumps(recs)))
+
+
+if __name__ == '__main__':
+    convert_json_to_sql(in_file, out_file, table_name)
diff --git a/scripts/cat-recs-table.sql b/scripts/cat-recs-table.sql
new file mode 100644
index 0000000..2e1e9fc
--- /dev/null
+++ b/scripts/cat-recs-table.sql
@@ -0,0 +1,22 @@
+CREATE TABLE en_category_to_sections (
+    id int NOT NULL AUTO_INCREMENT,
+    /*
+    At the time of writing this, the longest enwiki category length is
+    116 characters:
+        United_Nations_Security_Council_resolutions_concerning_the_\
+        International_Criminal_Tribunal_for_the_former_Yugoslavia
+    For French the number is 111:
+        Catégorie:Chanteur_américain_dont_l'œuvre_est_marquée_par_\
+        l'homosexualité,_la_bisexualité_ou_le_transgendérisme
+    */
+    category varchar(190) NOT NULL,
+    /*
+     * Contains a list of sections with their relevance, e.g.:
+     * '[["Early life", 0.375],
+     *   ["Biography", 0.25], ...]'
+     */
+    sections text NOT NULL,
+    PRIMARY KEY (id)
+) ENGINE=InnoDB;
+
+CREATE INDEX category_index ON en_category_to_sections (category) USING HASH;

-- 
To view, visit https://gerrit.wikimedia.org/r/403938
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I748f174842f91bdd4f401d123a47be344eef920c
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/recommendation-api
Gerrit-Branch: master
Gerrit-Owner: Bmansurov <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to