Brion VIBBER has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/350097 )
Change subject: WIP - provisional revision table restructure ...................................................................... WIP - provisional revision table restructure Revision table compaction and prep work for MCR Currently includes (untested) tables.sql and a partial patch file for offline restructure (also untested). Still todo: * apply enough changes to archive to support deletion * decide on content_text_id vs content_address * hook up the updater * start prepping code to work with this schema! * online migration state management Open questions: * migrate format of existing archive stuff? * apply 'actor', 'comment' split-out change to non-revision stuff too, or wait to later? * is internal API on Revision enough to cover many changes, or... * ...do we need better API abstractions first? Bug: T161671 Change-Id: I50ee6506ff71e7613e50c33d865199b07918c1de --- A maintenance/archives/patch-revision-restructure.sql M maintenance/tables.sql 2 files changed, 234 insertions(+), 32 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core refs/changes/97/350097/1 diff --git a/maintenance/archives/patch-revision-restructure.sql b/maintenance/archives/patch-revision-restructure.sql new file mode 100644 index 0000000..b802992 --- /dev/null +++ b/maintenance/archives/patch-revision-restructure.sql @@ -0,0 +1,109 @@ +-- @TODO consider splitting this to three patch files: setup, migration, cleanup. + +-- +-- Setup: +-- Create the new tables, and add new columns to revision... +-- + +-- the table creations are in tables.sql, need to be copied here: +-- @TODO create comment +-- @TODO create actor +-- @TODO create content +-- @TODO create slots + +-- @TODO create slot_role +-- @TODO populate slot_role +-- @TODO create content_format +-- @TODO populate content_format +-- @TODO create content_model +-- @TODO populate content_model + +-- Add new fields to revision +ALTER TABLE revision + CHANGE COLUMN rev_id rev_id bigint unsigned NOT NULL PRIMARY KEY AUTO_INCREMENT, + ADD COLUMN rev_comment_id bigint + AFTER rev_page, + ADD COLUMN rev_actor int unsigned NOT NULL default 0 + AFTER rev_comment_id, +; +CREATE INDEX /*i*/actor_timestamp ON /*_*/revision (rev_actor,rev_timestamp); +CREATE INDEX /*i*/page_actor_timestamp ON /*_*/revision (rev_page,rev_actor,rev_timestamp); + +-- +-- Migration: +-- Move data from the old fields to the new tables, brute-force style. +-- +-- Production upgrade would not run this, instead migrating data in small bits +-- through a batch process. +-- +-- If this is run on a small wiki via update.php it should work, but requires +-- the wiki to be offline and won't provide any feedback about how long it +-- takes. +-- +-- A smarter updater plugin could do the migration in the same way we'd do it +-- in production, which might be safer. +-- + +-- Migrate comments... +INSERT INTO comment (comment_id, comment_text) + SELECT rev_id, rev_comment + FROM revision + ORDER BY rev_id; +UPDATE revision + SET rev_comment_id=rev_id + WHERE rev_comment_id IS NULL; + +-- Migrate actors... +INSERT INTO actor (actor_user) + SELECT rev_user FROM user + WHERE rev_user > 0 + GROUP BY rev_user + ORDER BY rev_user; +INSERT INTO actor (actor_text) + SELECT rev_user_text FROM revision + WHERE rev_user = 0 + GROUP BY rev_user_text + ORDER BY rev_user_text; +UPDATE revision + SET rev_actor = (SELECT actor_id FROM actor WHERE actor_user=rev_user) + WHERE rev_user > 0 AND rev_actor IS NULL; +UPDATE revision + SET rev_actor = (SELECT actor_id FROM actor WHERE actor_text=rev_user_text) + WHERE rev_user == 0 AND rev_actor IS NULL; + +-- Migrate content... +INSERT INTO content (cont_id,cont_text_id,cont_len,cont_sha1,cont_model,cont_format) + SELECT rev_id,rev_text_id,rev_len,rev_sha1,cm_id,cf_id + FROM revision + LEFT JOIN content_model ON rev_content_model=cm_model + LEFT JOIN content_format ON rev_content_format=cm_format; + +-- Migrate slot associations... +INSERT INTO slots (slot_revision,slot_content,slot_role) + SELECT rev_id,rev_id,sr_id + FROM revision + LEFT JOIN slot_role ON sr_role='default'; + +-- +-- Cleanup: +-- Once everything is migrated, we can remove the obsolete fields from +-- the revision table and their indexes. +-- +-- Production upgrade would run this on replicas when they're out of rotation. +-- + +-- Drop old fields +ALTER TABLE revision + DROP KEY user_timestamp, + DROP KEY usertext_timestamp, + DROP KEY page_user_timestamp, + + DROP COLUMN rev_comment, + DROP COLUMN rev_text_id, + DROP COLUMN rev_user, + DROP COLUMN rev_user_text, + DROP COLUMN rev_len, + DROP COLUMN rev_sha1, + DROP COLUMN rev_content_model, + DROP COLUMN rev_content_format +; diff --git a/maintenance/tables.sql b/maintenance/tables.sql index 38fef45..fec2bff 100644 --- a/maintenance/tables.sql +++ b/maintenance/tables.sql @@ -323,28 +323,19 @@ -- CREATE TABLE /*_*/revision ( -- Unique ID to identify each revision - rev_id int unsigned NOT NULL PRIMARY KEY AUTO_INCREMENT, + rev_id bigint unsigned NOT NULL PRIMARY KEY AUTO_INCREMENT, -- Key to page_id. This should _never_ be invalid. rev_page int unsigned NOT NULL, - -- Key to text.old_id, where the actual bulk text is stored. - -- It's possible for multiple revisions to use the same text, - -- for instance revisions where only metadata is altered - -- or a rollback to a previous version. - rev_text_id int unsigned NOT NULL, + -- Key to comment.comment_id, where the actual comment text is stored. + -- It's possible for multiple revisions to use the same comment reference, + -- or for this to be NULL if no comment is associated. + rev_comment_id bigint, - -- Text comment summarizing the change. - -- This text is shown in the history and other changes lists, - -- rendered in a subset of wiki markup by Linker::formatComment() - rev_comment varbinary(767) NOT NULL, - - -- Key to user.user_id of the user who made this edit. - -- Stores 0 for anonymous edits and for some mass imports. - rev_user int unsigned NOT NULL default 0, - - -- Text username or IP address of the editor. - rev_user_text varchar(255) binary NOT NULL default '', + -- Key to actor.actor_id of the user who made this edit. + -- actor_user or actor_text will have the actual id or IP address. + rev_user_entry int unsigned NOT NULL default 0, -- Timestamp of when revision was created rev_timestamp binary(14) NOT NULL default '', @@ -356,21 +347,9 @@ -- Restrictions on who can access this revision rev_deleted tinyint unsigned NOT NULL default 0, - -- Length of this revision in bytes - rev_len int unsigned, - -- Key to revision.rev_id -- This field is used to add support for a tree structure (The Adjacency List Model) rev_parent_id int unsigned default NULL, - - -- SHA-1 text content hash in base-36 - rev_sha1 varbinary(32) NOT NULL default '', - - -- content model, see CONTENT_MODEL_XXX constants - rev_content_model varbinary(32) DEFAULT NULL, - - -- content format, see CONTENT_FORMAT_XXX constants - rev_content_format varbinary(64) DEFAULT NULL ) /*$wgDBTableOptions*/ MAX_ROWS=10000000 AVG_ROW_LENGTH=1024; -- In case tables are created as MyISAM, use row hints for MySQL <5.0 to avoid 4GB limit @@ -378,9 +357,123 @@ CREATE INDEX /*i*/rev_page_id ON /*_*/revision (rev_page, rev_id); CREATE INDEX /*i*/rev_timestamp ON /*_*/revision (rev_timestamp); CREATE INDEX /*i*/page_timestamp ON /*_*/revision (rev_page,rev_timestamp); -CREATE INDEX /*i*/user_timestamp ON /*_*/revision (rev_user,rev_timestamp); -CREATE INDEX /*i*/usertext_timestamp ON /*_*/revision (rev_user_text,rev_timestamp); -CREATE INDEX /*i*/page_user_timestamp ON /*_*/revision (rev_page,rev_user,rev_timestamp); +CREATE INDEX /*i*/actor_timestamp ON /*_*/revision (rev_actor,rev_timestamp); +CREATE INDEX /*i*/page_actor_timestamp ON /*_*/revision (rev_page,rev_actor,rev_timestamp); + +-- +-- Revisions are usually marked with a textual comment describing the change. +-- They are stored in the comment table to keep revision more compact, +-- and potentially to allow combining entries for identical comments. +-- +CREATE TABLE /*_*/comment( + -- Unique ID to identify each comment + comment_id bigint unsigned NOT NULL PRIMARY KEY AUTO_INCREMENT, + + -- Text comment summarizing the change. + -- This text is shown in the history and other changes lists, + -- rendered in a subset of wiki markup by Linker::formatComment() + -- Size limits are enforced at the application level, and should + -- take care to crop UTF-8 strings appropriately. + comment_text MEDIUMBLOB NOT NULL +) /*$wgDBTableOptions*/; + +-- +-- Each revision references one or more content objects via the +-- slots table, ending up here. +-- +CREATE TABLE /*_*/content ( + cont_id bigint unsigned NOT NULL PRIMARY KEY AUTO_INCREMENT, + + -- Key to text.old_id, where the actual bulk text is stored. + -- It's possible for multiple revisions to use the same text, + -- for instance revisions where only metadata is altered + -- or a rollback to a previous version. + cont_text_id bigint unsigned NOT NULL, + + -- Length of this content in bytes + cont_len int unsigned, + + -- SHA-1 text content hash in base-36 + cont_sha1 varbinary(32) NOT NULL DEFAULT '', + + -- content model, keys to content_model.cm_id for cm_model + -- See CONTENT_MODEL_XXX constants + cont_model smallint NOT NULL DEFAULT 0, + + -- content format, keys to content_format.cf_id for cf_format + -- See CONTENT_FORMT_XXX constants + cont_format smallint NOT NULL DEFAULT 0 +) /*$wgDBTableOptions*/; + + +-- +-- Slots are the association between a revision and one of its content objects. +-- Classic wiki pages may use only a single, default slot but more complex +-- data types may associate multiple content objects. +-- +CREATE TABLE /*_*/slots ( + -- Key to revision.rev_id + slot_revision BIGINT UNSIGNED NOT NULL, + + -- Key to content.cont_id + slot_content BIGINT UNSIGNED NOT NULL, + + -- Key to content_roles.cr_id + slot_role SMALLINT UNSIGNED NOT NULL, + + PRIMARY KEY (slot_revision, slot_role, slot_content) +) /*$wgDBTableOptions*/; + +CREATE UNIQUE INDEX /*i*/slot_revision_role ON /*_*/slots (slot_revision, slot_role); + + +-- +-- Mapping table for content slot roles. +-- +CREATE TABLE /*_*/content_roles ( + -- Unique ID for each role, used in slots.slot_role + cr_id SMALLINT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT, + + -- Textual role name + cr_role VARBINARY(32) NOT NULL DEFAULT '' +) /*$wgDBTableOptions*/; + +CREATE UNIQUE INDEX /*i*/cr_role ON /*_*/content_roles (cr_role); + + +-- +-- Mapping table for content models. +-- A content model represents the conceptual data model of the underlying +-- page content, such as wikitext or javascript or a graph definition. +-- +CREATE TABLE /*_*/content_model ( + -- Unique ID for each content model, used in content.cont_model + cm_id smallint NOT NULL PRIMARY KEY AUTO_INCREMENT, + + -- Textual content model name + -- See CONTENT_MODEL_XXX constants + cm_model varbinary(32) NOT NULL DEFAULT '' +) /*$wgDBTableOptions*/; + +CREATE UNIQUE INDEX /*i*/cm_model ON /*_*/content_model (cm_model); + + +-- +-- Mapping table for content formats. +-- A content format represents the in-DB representation of the underlying +-- page content, such as wikitext or JSON. +-- +CREATE TABLE /*_*/content_format ( + -- Unique ID for each content format, used in content.cont_format + cf_id smallint NOT NULL PRIMARY KEY AUTO_INCREMENT, + + -- Textual content format name + -- See CONTENT_FORMT_XXX constants + cf_format varbinary(64) NOT NULL DEFAULT '' +) /*$wgDBTableOptions*/; + +CREATE UNIQUE INDEX /*i*/cf_format ON /*_*/content_format (cf_format); + -- -- Holds text of individual page revisions. -- To view, visit https://gerrit.wikimedia.org/r/350097 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I50ee6506ff71e7613e50c33d865199b07918c1de Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/core Gerrit-Branch: master Gerrit-Owner: Brion VIBBER <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
