Smalyshev has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/372905 )
Change subject: Create update SPARQL for category changes
......................................................................
Create update SPARQL for category changes
Bug: T173774
Change-Id: I9867ad566c0619b55a48a011bd3c55321b1bfcff
---
M autoload.php
M maintenance/CategoriesRdf.php
A maintenance/categoryChangesAsRdf.php
3 files changed, 427 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core
refs/changes/05/372905/1
diff --git a/autoload.php b/autoload.php
index 80e0cd3..d2191f2 100644
--- a/autoload.php
+++ b/autoload.php
@@ -221,6 +221,7 @@
'CapsCleanup' => __DIR__ . '/maintenance/cleanupCaps.php',
'CategoriesRdf' => __DIR__ . '/maintenance/CategoriesRdf.php',
'Category' => __DIR__ . '/includes/Category.php',
+ 'CategoryChangesAsRdf' => __DIR__ .
'/maintenance/categoryChangesAsRdf.php',
'CategoryFinder' => __DIR__ . '/includes/CategoryFinder.php',
'CategoryMembershipChange' => __DIR__ .
'/includes/changes/CategoryMembershipChange.php',
'CategoryMembershipChangeJob' => __DIR__ .
'/includes/jobqueue/jobs/CategoryMembershipChangeJob.php',
diff --git a/maintenance/CategoriesRdf.php b/maintenance/CategoriesRdf.php
index 8e93f20..bac011b 100644
--- a/maintenance/CategoriesRdf.php
+++ b/maintenance/CategoriesRdf.php
@@ -74,6 +74,15 @@
}
/**
+ * Make URL from title label: builds a Title in NS_CATEGORY and passes it to titleToUrl().
+ * @param string $titleLabel Category title text, as accepted by Title::makeTitle()
+ * @return string
+ */
+ public function labelToUrl( $titleLabel ) {
+ return $this->titleToUrl( Title::makeTitle( NS_CATEGORY,
$titleLabel ) );
+ }
+
+ /**
* Convert Title to link to target page.
* @param Title $title
* @return string
diff --git a/maintenance/categoryChangesAsRdf.php
b/maintenance/categoryChangesAsRdf.php
new file mode 100644
index 0000000..c5fb13a
--- /dev/null
+++ b/maintenance/categoryChangesAsRdf.php
@@ -0,0 +1,417 @@
+<?php
+/**
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+use Wikimedia\Purtle\RdfWriter;
+use Wikimedia\Purtle\TurtleRdfWriter;
+use Wikimedia\Rdbms\IDatabase;
+
+require_once __DIR__ . '/Maintenance.php';
+
+/**
+ * Maintenance script to provide RDF representation of the recent changes in
+ * the category tree.
+ *
+ * @ingroup Maintenance
+ * @since 1.30
+ */
+class CategoryChangesAsRdf extends Maintenance {
+ /**
+  * Insert query.
+  * The single %s placeholder receives the triples to insert.
+  */
+ const SPARQL_INSERT = <<<SPARQL
+INSERT DATA {
+%s
+};
+
+SPARQL;
+
+ /**
+  * Delete/Insert query.
+  *
+  * sprintf() arguments, in the order used by getCategoriesUpdate():
+  *  1. the triples to insert,
+  *  2. the space-separated list of category URIs (in <>) whose triples are deleted.
+  *
+  * The WHERE clause has to bind ?x and ?y: a DELETE template triple that still
+  * contains an unbound variable is skipped, so binding only ?category would
+  * delete nothing. The insert is a separate INSERT DATA operation so that the
+  * new triples are written even when none of the listed categories had any
+  * triples to match.
+  *
+  * This is a nowdoc, so the positional placeholders (%1$s, %2$s) are not
+  * treated as PHP variable interpolation.
+  */
+ const SPARQL_DELETE_INSERT = <<<'SPARQLDI'
+DELETE {
+ ?category ?x ?y
+}
+WHERE {
+ ?category ?x ?y .
+ VALUES ?category {
+ %2$s
+ }
+};
+INSERT DATA {
+ %1$s
+};
+
+SPARQLDI;
+
+ /**
+  * Writer used to buffer RDF output before it is embedded into SPARQL.
+  * @var RdfWriter
+  */
+ private $rdfWriter;
+ /**
+  * Categories RDF helper.
+  * @var CategoriesRdf
+  */
+ private $categoriesRdf;
+
+ /**
+ * Register the script description, the batch size and the command-line options.
+ */
+ public function __construct() {
+ parent::__construct();
+
+ $this->addDescription( "Generate RDF dump of category changes
in a wiki." );
+
+ // Batch size is read back through $this->mBatchSize by the row iterators.
+ $this->setBatchSize( 200 );
+ // 'output' is read in execute(), which falls back to php://stdout.
+ $this->addOption( 'output', "Output file (default is stdout).
Will be overwritten.",
+ false, true );
+ // 's' and 'e' bound the rc_timestamp window: start inclusive, end exclusive.
+ $this->addOption( 's', 'Starting timestamp (inclusive)', true,
true );
+ $this->addOption( 'e', 'Ending timestamp (exclusive)', true,
true );
+ }
+
+ /**
+  * Write SPARQL Update statements describing category changes in the
+  * requested time window to the output file.
+  *
+  * Statements are written in this order: prefixes, deletes, moves, restores,
+  * additions, edits, and finally the timestamp update. A page ID handled by
+  * an earlier step is recorded in $processed and skipped by later steps.
+  *
+  * @return bool|null False if the output file could not be opened
+  */
+ public function execute() {
+     $outFile = $this->getOption( 'output', 'php://stdout' );
+
+     if ( $outFile === '-' ) {
+         $outFile = 'php://stdout';
+     }
+
+     $output = fopen( $outFile, 'w' );
+     if ( $output === false ) {
+         // Without a valid handle every fwrite() below would fail.
+         $this->error( "Could not open output file: $outFile" );
+         return false;
+     }
+
+     // SPARQL Update is close to TTL
+     $this->rdfWriter = new TurtleRdfWriter();
+     $this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );
+
+     $this->categoriesRdf->setupPrefixes();
+     $this->rdfWriter->start();
+
+     $prefixes = $this->rdfWriter->drain();
+     // we have to strip @ from prefix, since SPARQL UPDATE doesn't use them
+     $prefixes = preg_replace( '/^@/m', '', $prefixes );
+     fwrite( $output, $prefixes );
+
+     $dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
+
+     $processed = []; // So we don't try to process same things twice
+
+     // Handle deletes
+     // This only does "true" deletes - i.e. those that the page stays deleted
+     foreach ( $this->getDeletedCatsIterator( $dbr ) as $batch ) {
+         $deleteUrls = [];
+         foreach ( $batch as $row ) {
+             // This can produce duplicates, we don't care
+             $deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
+             $processed[$row->rc_cur_id] = true;
+         }
+         fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, [] ) );
+     }
+
+     // Handle moves
+     // Moves go before additions because if category is moved, we should not
+     // process creation as it would produce wrong data - because create row
+     // has old title
+     foreach ( $this->getMovedCatsIterator( $dbr ) as $batch ) {
+         $pages = [];
+         $deleteUrls = [];
+         foreach ( $batch as $row ) {
+             // The old title is always deleted, even if the row is skipped below.
+             $deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
+             if ( isset( $processed[$row->rc_cur_id] ) ) {
+                 // We already captured this one before
+                 continue;
+             }
+
+             if ( $row->page_namespace != NS_CATEGORY ) {
+                 // Page is no longer in the category namespace: nothing to insert.
+                 continue;
+             }
+             $this->categoriesRdf->writeCategoryData( $row->page_title );
+             $pages[$row->rc_cur_id] = $row->page_title;
+             $processed[$row->rc_cur_id] = true;
+         }
+
+         fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages ) );
+     }
+
+     // Handle restores
+     // We need to handle restores too since delete may have happened in
+     // previous update.
+     foreach ( $this->getRestoredCatsIterator( $dbr ) as $batch ) {
+         $pages = [];
+         foreach ( $batch as $row ) {
+             if ( isset( $processed[$row->rc_cur_id] ) ) {
+                 // We already captured this one before
+                 continue;
+             }
+             $this->categoriesRdf->writeCategoryData( $row->rc_title );
+             $pages[$row->rc_cur_id] = $row->rc_title;
+             $processed[$row->rc_cur_id] = true;
+         }
+
+         if ( empty( $pages ) ) {
+             continue;
+         }
+
+         $this->writeParentCategories( $dbr, $pages );
+
+         fwrite( $output, sprintf( self::SPARQL_INSERT, $this->rdfWriter->drain() ) );
+     }
+
+     // Handle additions
+     foreach ( $this->getNewCatsIterator( $dbr ) as $batch ) {
+         $pages = [];
+         foreach ( $batch as $row ) {
+             if ( isset( $processed[$row->rc_cur_id] ) ) {
+                 // We already captured this one before
+                 continue;
+             }
+             $this->categoriesRdf->writeCategoryData( $row->rc_title );
+             $pages[$row->rc_cur_id] = $row->rc_title;
+             $processed[$row->rc_cur_id] = true;
+         }
+
+         if ( empty( $pages ) ) {
+             continue;
+         }
+
+         $this->writeParentCategories( $dbr, $pages );
+
+         fwrite( $output, sprintf( self::SPARQL_INSERT, $this->rdfWriter->drain() ) );
+     }
+
+     // Handle changes
+     foreach ( $this->getChangedCatsIterator( $dbr ) as $batch ) {
+         $pages = [];
+         $deleteUrls = [];
+         foreach ( $batch as $row ) {
+             if ( isset( $processed[$row->rc_cur_id] ) ) {
+                 // We already captured this one before
+                 continue;
+             }
+             $this->categoriesRdf->writeCategoryData( $row->rc_title );
+             $pages[$row->rc_cur_id] = $row->rc_title;
+             $processed[$row->rc_cur_id] = true;
+             $deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
+         }
+
+         fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages ) );
+     }
+
+     // Update timestamp
+     $to = $this->getOption( 'e' );
+     fwrite( $output, $this->updateTS( $to ) );
+
+     fclose( $output );
+ }
+
+ /**
+  * Build the SPARQL statement that replaces the data of a set of categories.
+  *
+  * Returns an empty string when there is nothing to delete. Otherwise the
+  * parent-category links of $pages are written to the RDF writer, and the
+  * writer's buffered output is embedded into SPARQL_DELETE_INSERT together
+  * with the list of URIs to delete.
+  *
+  * @param IDatabase $dbr
+  * @param string[] $deleteUrls List of URIs to be deleted, with <>
+  * @param string[] $pages List of categories: id => title
+  * @return string SPARQL query
+  */
+ private function getCategoriesUpdate( IDatabase $dbr, $deleteUrls, $pages ) {
+     if ( !$deleteUrls ) {
+         return '';
+     }
+
+     if ( $pages ) {
+         $this->writeParentCategories( $dbr, $pages );
+     }
+
+     $insertData = $this->rdfWriter->drain();
+     $categoryList = implode( ' ', $deleteUrls );
+
+     return sprintf( self::SPARQL_DELETE_INSERT, $insertData, $categoryList );
+ }
+
+ /**
+  * Write the parent-category links for a set of categories to the RDF writer.
+  *
+  * Each row from getCategoryLinksIterator() is emitted as a link from the
+  * title stored under its cl_from page ID to its cl_to value.
+  *
+  * @param IDatabase $dbr
+  * @param string[] $pages List of categories: id => title
+  */
+ private function writeParentCategories( IDatabase $dbr, $pages ) {
+     $pageIds = array_keys( $pages );
+     $links = $this->getCategoryLinksIterator( $dbr, $pageIds );
+     foreach ( $links as $link ) {
+         $childTitle = $pages[$link->cl_from];
+         $this->categoriesRdf->writeCategoryLinkData( $childTitle, $link->cl_to );
+     }
+ }
+
+ /**
+ * Build the SPARQL Update statement that replaces the schema:dateModified
+ * value of the categories dump node with the given timestamp.
+ * The node URI is the canonical expansion of '/categoriesDump'.
+ * NOTE(review): the statement uses the schema: and xsd: prefixes, which are
+ * presumably among those written at the start of execute() - confirm.
+ * @param string|int $timestamp Timestamp for last change
+ * @return string SPARQL Update query for timestamp.
+ */
+ public function updateTS( $timestamp ) {
+ $dumpUrl = '<' . wfExpandUrl( '/categoriesDump',
PROTO_CANONICAL ) . '>';
+ // Timestamp is rendered in ISO 8601 form for the xsd:dateTime literal.
+ $ts = wfTimestamp( TS_ISO_8601, $timestamp );
+ // First delete any existing dateModified value, then insert the new one.
+ $tsQuery = <<<SPARQL
+DELETE {
+ $dumpUrl schema:dateModified ?o .
+}
+WHERE {
+ $dumpUrl schema:dateModified ?o .
+};
+INSERT DATA {
+ $dumpUrl schema:dateModified "$ts"^^xsd:dateTime .
+}
+
+SPARQL;
+ return $tsQuery;
+ }
+
+ /**
+  * Build the batch iterator over newly created categories.
+  *
+  * Selects recentchanges rows in the category namespace flagged as new
+  * (rc_new = 1) inside the requested timestamp window.
+  *
+  * @param IDatabase $dbr
+  * @return BatchRowIterator
+  */
+ private function getNewCatsIterator( IDatabase $dbr ) {
+     $newCats = new BatchRowIterator( $dbr, 'recentchanges', [ 'rc_timestamp' ], $this->mBatchSize );
+     $this->addTimestampConditions( $newCats, $dbr );
+
+     $conds = [
+         'rc_namespace' => NS_CATEGORY,
+         'rc_new' => 1,
+     ];
+     $newCats->addConditions( $conds );
+     $newCats->setFetchColumns( [ 'rc_title', 'rc_cur_id' ] );
+
+     return $newCats;
+ }
+
+ /**
+ * Fetch moved categories.
+ * Selects 'move' log entries in the category namespace within the timestamp
+ * window, joined to the page table on rc_cur_id = page_id.
+ * In execute(), rc_title is treated as the old title to delete and
+ * page_title / page_namespace as the page's current location.
+ * @param IDatabase $dbr
+ * @return BatchRowIterator
+ */
+ private function getMovedCatsIterator( IDatabase $dbr ) {
+ $it = new BatchRowIterator( $dbr,
+ [ 'recentchanges', 'page' ],
+ [ 'rc_timestamp' ],
+ $this->mBatchSize
+ );
+ $this->addTimestampConditions( $it, $dbr );
+ $it->addConditions( [
+ 'rc_namespace' => NS_CATEGORY,
+ 'rc_new' => 0,
+ 'rc_log_type' => 'move',
+ 'rc_type' => RC_LOG,
+ ] );
+ // INNER JOIN: only rows whose page record still exists are returned.
+ $it->addJoinConditions( [ 'page' => [ 'INNER JOIN', 'rc_cur_id
= page_id' ] ] );
+ $it->setFetchColumns( [ 'page_title', 'page_namespace',
'rc_cur_id', 'rc_title' ] );
+ return $it;
+ }
+
+ /**
+ * Fetch deleted categories.
+ * Selects 'delete'/'delete' log entries in the category namespace within the
+ * timestamp window, keeping only those with no matching page row
+ * (LEFT JOIN on rc_cur_id = page_id combined with 'page_id IS NULL').
+ * @param IDatabase $dbr
+ * @return BatchRowIterator
+ */
+ private function getDeletedCatsIterator( IDatabase $dbr ) {
+ $it = new BatchRowIterator( $dbr,
+ [ 'recentchanges', 'page' ],
+ [ 'rc_timestamp' ],
+ $this->mBatchSize
+ );
+ $this->addTimestampConditions( $it, $dbr );
+ $it->addConditions( [
+ 'rc_namespace' => NS_CATEGORY,
+ 'rc_new' => 0,
+ 'rc_log_type' => 'delete',
+ 'rc_log_action' => 'delete',
+ 'rc_type' => RC_LOG,
+ 'page_id IS NULL'
+ ] );
+ // We will fetch ones that do not have page record. If they do,
+ // this means they were restored, thus restoring handler will
pick it up.
+ $it->addJoinConditions( [ 'page' => [ 'LEFT JOIN', 'rc_cur_id =
page_id' ] ] );
+ $it->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );
+ return $it;
+ }
+
+ /**
+ * Fetch restored categories.
+ * Selects 'delete'/'restore' log entries in the category namespace within
+ * the timestamp window, joined to the page table on rc_cur_id = page_id.
+ * page_title is fetched as well, but execute() only reads rc_cur_id and
+ * rc_title from these rows.
+ * @param IDatabase $dbr
+ * @return BatchRowIterator
+ */
+ private function getRestoredCatsIterator( IDatabase $dbr ) {
+ $it = new BatchRowIterator( $dbr,
+ [ 'recentchanges', 'page' ],
+ [ 'rc_timestamp' ],
+ $this->mBatchSize
+ );
+ $this->addTimestampConditions( $it, $dbr );
+ $it->addConditions( [
+ 'rc_namespace' => NS_CATEGORY,
+ 'rc_new' => 0,
+ 'rc_log_type' => 'delete',
+ 'rc_log_action' => 'restore',
+ 'rc_type' => RC_LOG,
+ ] );
+ // We will only fetch ones that have page record
+ $it->addJoinConditions( [ 'page' => [ 'INNER JOIN', 'rc_cur_id
= page_id' ] ] );
+ $it->setFetchColumns( [ 'rc_cur_id', 'rc_title', 'page_title' ]
);
+ return $it;
+ }
+
+ /**
+  * Build the batch iterator over edited categories.
+  *
+  * Selects recentchanges rows of type RC_EDIT in the category namespace that
+  * are not page creations (rc_new = 0) inside the requested timestamp window.
+  *
+  * @param IDatabase $dbr
+  * @return BatchRowIterator
+  */
+ private function getChangedCatsIterator( IDatabase $dbr ) {
+     $changedCats = new BatchRowIterator( $dbr, 'recentchanges', [ 'rc_timestamp' ], $this->mBatchSize );
+     $this->addTimestampConditions( $changedCats, $dbr );
+
+     $conds = [
+         'rc_namespace' => NS_CATEGORY,
+         'rc_new' => 0,
+         'rc_type' => RC_EDIT,
+     ];
+     $changedCats->addConditions( $conds );
+     $changedCats->setFetchColumns( [ 'rc_title', 'rc_cur_id' ] );
+
+     return $changedCats;
+ }
+
+ /**
+  * Add timestamp limits to iterator.
+  *
+  * Restricts rows to rc_timestamp >= the 's' option and < the 'e' option.
+  * The values come from the command line and are concatenated into raw SQL
+  * conditions, so they are passed through addQuotes() after being converted
+  * to the database timestamp format.
+  *
+  * @param BatchRowIterator $it Iterator
+  * @param IDatabase $dbr
+  */
+ private function addTimestampConditions( BatchRowIterator $it, IDatabase $dbr ) {
+     $start = $dbr->addQuotes( $dbr->timestamp( $this->getOption( 's' ) ) );
+     $end = $dbr->addQuotes( $dbr->timestamp( $this->getOption( 'e' ) ) );
+
+     $it->addConditions( [
+         'rc_timestamp >= ' . $start,
+         'rc_timestamp < ' . $end,
+     ] );
+ }
+
+ /**
+  * Get iterator for links for categories.
+  *
+  * Iterates categorylinks rows of type 'subcat' whose cl_from is one of the
+  * given page IDs. The batch iterator is wrapped so that callers receive
+  * individual rows rather than batches.
+  *
+  * @param IDatabase $dbr
+  * @param array $ids List of page IDs
+  * @return Traversable
+  */
+ public function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
+     $linkColumns = [ 'cl_from', 'cl_to' ];
+     $batches = new BatchRowIterator( $dbr, 'categorylinks', $linkColumns, $this->mBatchSize );
+
+     $conds = [
+         'cl_type' => 'subcat',
+         'cl_from' => $ids
+     ];
+     $batches->addConditions( $conds );
+     $batches->setFetchColumns( [ 'cl_from', 'cl_to' ] );
+
+     return new RecursiveIteratorIterator( $batches );
+ }
+
+}
+
+$maintClass = "CategoryChangesAsRdf";
+require_once RUN_MAINTENANCE_IF_MAIN;
--
To view, visit https://gerrit.wikimedia.org/r/372905
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I9867ad566c0619b55a48a011bd3c55321b1bfcff
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/core
Gerrit-Branch: master
Gerrit-Owner: Smalyshev <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits