Bmansurov has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/403938 )
Change subject: WIP: Add support for category-based section recommendations
......................................................................
WIP: Add support for category-based section recommendations
To get relevant sections for the Zoology_book category, ping
/section/category/Zoology_book. The result will include a list of
section names and their relevance scores, e.g.:
[["Reception",0.30434782608695654],
["Contents",0.17391304347826086],
["Overview",0.17391304347826086],
...]
The patch assumes that models for various languages exist in a database.
The original model data is in JSON, which can be converted to SQL using
scripts/cat-recs-json-to-sql.py. The table format is described in
scripts/cat-recs-table.sql.
Bug: T183043
Change-Id: I748f174842f91bdd4f401d123a47be344eef920c
---
M config.dev.yaml
M package.json
A routes/section.js
A scripts/cat-recs-json-to-sql.py
A scripts/cat-recs-table.sql
5 files changed, 183 insertions(+), 2 deletions(-)
git pull
ssh://gerrit.wikimedia.org:29418/mediawiki/services/recommendation-api
refs/changes/38/403938/1
diff --git a/config.dev.yaml b/config.dev.yaml
index aa01eec..d59d933 100644
--- a/config.dev.yaml
+++ b/config.dev.yaml
@@ -115,3 +115,16 @@
gsrlimit: 1
gsrsearch: '{{params.seed}}'
gsrprop: ''
+ mysql:
+ connection_limit: 10
+ host: 'localhost'
+ user: 'user'
+ password : 'password'
+ database : 'database'
+ tables:
+ # {{lang}} comes from section.category_models
+ category_to_sections: '{{lang}}_category_to_sections'
+ section:
+ category_models:
+ - 'en'
+ - 'fr'
\ No newline at end of file
diff --git a/package.json b/package.json
index 25d40c7..9625ad9 100644
--- a/package.json
+++ b/package.json
@@ -37,12 +37,13 @@
"compression": "^1.7.1",
"domino": "^1.0.30",
"express": "^4.16.2",
+ "http-shutdown": "^1.2.0",
"js-yaml": "^3.10.0",
+ "mysql": "^2.15.0",
"preq": "^0.5.3",
"service-runner": "^2.4.2",
"swagger-router": "^0.7.1",
- "swagger-ui": "git+https://github.com/wikimedia/swagger-ui#master",
- "http-shutdown": "^1.2.0"
+ "swagger-ui": "git+https://github.com/wikimedia/swagger-ui#master"
},
"devDependencies": {
"ajv": "^5.5.0",
diff --git a/routes/section.js b/routes/section.js
new file mode 100644
index 0000000..dd54460
--- /dev/null
+++ b/routes/section.js
@@ -0,0 +1,88 @@
+'use strict';
+
+const mysql = require('mysql');
+const Template = require('swagger-router').Template;
+
+const sUtil = require('../lib/util');
+
+
+/**
+ * The main router object
+ */
+const router = sUtil.router();
+
+/**
+ * The main application object reported when this module is require()d
+ */
+let app;
+
+/**
+ * Call callback with sections matching category in language
+ * @param {string} language model language
+ * @param {string} seed category name
+ * @param {Function<Object, Object[]>} callback that accepts error and results
+ */
+function recommendFromCategory(language, seed, callback) {
+ const tableTpl = new Template(app.conf.mysql.tables.category_to_sections);
+ const table = tableTpl.expand({
+ lang: language
+ });
+ app.mysqlPool.query(
+ `SELECT sections FROM ?? WHERE category=?`,
+ [table, seed],
+ (error, results, _) => {
+ if (error) {
+ app.logger.log('error/db', error);
+ }
+ callback(
+ error,
+ results && results.length ? JSON.parse(results[0].sections) : []
+ );
+ });
+}
+
+/**
+ * GET /category/{seed}
+ * Gets sections based on category and language
+ */
+router.get('/category/:seed', (req, res) => {
+ const language = req.params.domain.split('.')[0]; // e.g. en
+ if (!app.conf.section.category_models.includes(language)) {
+ app.logger.log('error/section',
+ `Model for "${language}" doesn't exist.`);
+ throw new sUtil.HTTPError({
+ status: 400
+ });
+ }
+ recommendFromCategory(language, req.params.seed, (error, result) => {
+ if (error) {
+ const errorObject = new sUtil.HTTPError({ status: 400 });
+ res.status(errorObject.status).send(errorObject);
+ } else {
+ res.json(result);
+ }
+ });
+});
+
+
+module.exports = function(appObj) {
+
+ app = appObj;
+
+ const mysqlConf = app.conf.mysql;
+ app.mysqlPool = mysql.createPool({
+ connectionLimit: mysqlConf.connection_limit,
+ host: mysqlConf.host,
+ user: mysqlConf.user,
+ password: mysqlConf.password,
+ database: mysqlConf.database
+ });
+
+ return {
+ path: '/section',
+ api_version: 1,
+ router
+ };
+
+};
+
diff --git a/scripts/cat-recs-json-to-sql.py b/scripts/cat-recs-json-to-sql.py
new file mode 100755
index 0000000..e1a203c
--- /dev/null
+++ b/scripts/cat-recs-json-to-sql.py
@@ -0,0 +1,57 @@
+#!/usr/bin/python3
+
+# Data is currently available at:
+# https://meta.wikimedia.org/wiki/Research:Expanding_Wikipedia_articles_across_languages/Data
+
+import json
+
+in_file = "en_category_recommendations.json"
+out_file = "en_category_recommendations.sql"
+table_name = "en_category_to_sections"
+
+
+def convert_json_to_sql(in_file, out_file, table_name):
+ """Convert recommendations JSON file into an SQL file ready to be
+ imported into a database. The table format is described in
+ cat-recs-table.sql.
+
+ The original file format looks like so:
+ {
+ "category": "Canadian_Pacific_Railway_executives",
+ "recs": [
+ {
+ "relevance": 0.375,
+ "title": "Early life"
+ },
+ {
+ "relevance": 0.25,
+ "title": "Philanthropy"
+ },
+ ...
+ ]
+ }
+
+ The SQL format looks like so (where sections are ordered by
+ relevance first and then lexicographically):
+ INSERT INTO en_category_to_sections(category, sections)
+ VALUES('Canadian_Pacific_Railway_executives',
+ '[["Early life", 0.375], ["Biography", 0.25], ...]');
+ """
+ query = "INSERT INTO %s(category, sections) VALUES('%s', \'%s\');\n"
+ with open(in_file, "r") as inf:
+ with open(out_file, "w") as outf:
+ for line in inf:
+ data = json.loads(line)
+ recs = [(x['title'].replace("'", "''"), x['relevance'])
+ for x in data['recs']]
+ recs.sort(key=lambda x: x[0])
+ recs.sort(key=lambda x: x[1], reverse=True)
+ outf.write(
+ query
+ % (table_name,
+ data['category'].replace("'", "''"),
+ json.dumps(recs)))
+
+
+if __name__ == '__main__':
+ convert_json_to_sql(in_file, out_file, table_name)
diff --git a/scripts/cat-recs-table.sql b/scripts/cat-recs-table.sql
new file mode 100644
index 0000000..2e1e9fc
--- /dev/null
+++ b/scripts/cat-recs-table.sql
@@ -0,0 +1,22 @@
+CREATE TABLE en_category_to_sections (
+ id int NOT NULL AUTO_INCREMENT,
+ /*
+ At the time of writing this, the longest enwiki category length is
+ 116 characters:
+ United_Nations_Security_Council_resolutions_concerning_the_\
+ International_Criminal_Tribunal_for_the_former_Yugoslavia
+ For French the number is 111:
+ Catégorie:Chanteur_américain_dont_l'œuvre_est_marquée_par_\
+ l'homosexualité,_la_bisexualité_ou_le_transgendérisme
+ */
+ category varchar(190) NOT NULL,
+ /*
+ * Contains a list of sections with their relevance, e.g.:
+ * '[["Early life", 0.375],
+ * ["Biography", 0.25], ...]'
+ */
+ sections text NOT NULL,
+ PRIMARY KEY (id)
+) ENGINE=InnoDB;
+
+CREATE INDEX category_index ON en_category_to_sections (category) USING HASH;
--
To view, visit https://gerrit.wikimedia.org/r/403938
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I748f174842f91bdd4f401d123a47be344eef920c
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/recommendation-api
Gerrit-Branch: master
Gerrit-Owner: Bmansurov <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits