Hiong3-eng5 has uploaded a new change for review. https://gerrit.wikimedia.org/r/96929
Change subject: Remove Duplicate Expressions ...................................................................... Remove Duplicate Expressions Find, adjust, remove. This does not delete the duplicate entry; it just marks it as removed. In the future, when the dm expression is not used, these duplicates should be deleted. As a precaution, the reviveExpression function was changed to limit the revival of expressions to 1 expression. Some duplicates like Tala have the same transaction id, so some expressions could be from the trailing space bug. Some may have been from reviveExpression. btw, Micheal's list must be rechecked in OmegaWiki after this script is used. For example, Tala has the same definition yet conflicting annotations. I have checked SQL to see if the duplicates were deleted, checked if the entries are ok, deleted all groundnut expressions to see if the expression is removed, then entered them again to see if the expression is revived. The SQL said it's all good; maybe you could double-check this? Thanks. Change-Id: Id70f8ed1ea392190d3b07f30815c6dfabf16c30a --- A Console/removeDuplicateExpression.php M OmegaWiki/WikiDataAPI.php 2 files changed, 263 insertions(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikiLexicalData refs/changes/29/96929/1 diff --git a/Console/removeDuplicateExpression.php b/Console/removeDuplicateExpression.php new file mode 100644 index 0000000..ea27b8d --- /dev/null +++ b/Console/removeDuplicateExpression.php @@ -0,0 +1,261 @@ +<?php + +/** +* Maintenance script to remove duplicate expessions +*/ + +$baseDir = dirname( __FILE__ ) . '/../../..' ; +require_once( $baseDir . '/maintenance/Maintenance.php' ); +require_once( $baseDir . '/extensions/WikiLexicalData/OmegaWiki/WikiDataGlobals.php' ); + +echo "start\n"; + +class RemoveDuplicateExpressions extends Maintenance { + + public function __construct() { + parent::__construct(); + $this->mDescription = "Maintenance tool to remove duplicated expressions\n" + . 
'Example usage: php removeDuplicateExpression.php --test=true ' . "\n" + . ' or simply' . "\n" + . 'php removeDuplicateExpression.php' . "\n"; + $this->addOption( 'test', 'true for test mode. e.g. --test=true' ); + } + + public function execute() { + + global $wdCurrentContext; + + $this->test = false; + // $this->test = true; + if ( $this->hasOption( 'test' ) ) { + $this->test = true; + } + + $this->output( "Starting remove duplicate expressions function...\n" ); + // check if there are duplicates greater than two + $this->output( "Finding duplicates\n" ); + $duplicates = $this->getDuplicates(); + + $haveDuplicates = 0; + $syntransHaveDuplicates = 0; + $sid = array(); + if ( $duplicates ) { + $haveDuplicates = 1; + foreach ( $duplicates as $rows ) { + $expression = $this->getSpellingExpressionId( $rows['spelling'], $rows['language_id'] ); + $this->output( "process {$rows['spelling']} ({$rows['language_id']}) - expression id: original is {$expression[0]}; duplicate is {$expression[1]}\n"); + $syntrans = $this->getSyntransToUpdate( $expression ); + + if ( $syntrans ) { + $syntransHaveDuplicates = 1; + foreach( $syntrans as $sids ) { + $sid[] = $sids; + if ( !$this->test ) { + // correct the duplication + $this->correctDuplication( $sids, $expression ); + } + } + } + + if ( !$this->test ) { + // remove the duplicate + $this->output( "removing duplicate id {$expression[1]}\n"); + $this->deleteDuplicate( $expression[1], $rows['language_id'] ); + + } + + } + } + + if ( $sid ) { + $totalSids = count( $sid ); + $this->output( "There are a total of {$totalSids} corrected\n"); + } + + if ( !$haveDuplicates ) { + $this->output( "Congratulations! No duplicates found\n" ); + return true; + } + + if ( !$syntransHaveDuplicates ) { + $this->output( "Congratulations! 
No syntrans have the duplicate expressions\n" ); + } + + } + + protected function deleteDuplicate( $expressionId, $languageId, $dc = null ) { + if ( is_null( $dc ) ) { + $dc = wdGetDataSetContext(); + } + $dbr = wfGetDB( DB_SLAVE ); + + $cond = null; + + // remove instead of delete. Lazier way out... for now. + // reviving expression is now limited to one expression + // to avoid duplicates from being revived. + // Adding to TODO list ~ he + +/* $queryResult = $dbr->delete( + "{$dc}_expression", + array( + 'remove_transaction_id' => null, + 'expression_id' => $expressionId, + 'language_id' => $languageId + ), + __METHOD__ + ); + +*/ + + $transactionId = getUpdateTransactionId(); + $queryResult = $dbr->update( + "{$dc}_expression", + array( + 'remove_transaction_id' => $transactionId, + ), + array( + 'remove_transaction_id' => null, + 'expression_id' => $expressionId, + ), + __METHOD__, + $cond + ); + } + + protected function correctDuplication( $syntransSid, $expressionId, $dc = null ) { + if ( is_null( $dc ) ) { + $dc = wdGetDataSetContext(); + } + $dbr = wfGetDB( DB_SLAVE ); + + $cond = null; + + $queryResult = $dbr->update( + "{$dc}_syntrans", + array( + 'expression_id' => $expressionId[0], + ), + array( + 'remove_transaction_id' => null, + 'expression_id' => $expressionId[1], + 'syntrans_sid' => $syntransSid + ), + __METHOD__, + $cond + ); + + } + + protected function getSyntransToUpdate( $expressionIds, $dc = null ) { + if ( is_null( $dc ) ) { + $dc = wdGetDataSetContext(); + } + $dbr = wfGetDB( DB_SLAVE ); + + $cond = null; + + $queryResult = $dbr->select( + "{$dc}_syntrans", + array( + 'syntrans_sid', + ), + array( + 'remove_transaction_id' => null, + 'expression_id' => $expressionIds[1] + ), + __METHOD__, + $cond + ); + + $sid = array(); + foreach ( $queryResult as $sids ) { + $sid[] = $sids->syntrans_sid; + } + + if ( $sid ) { + return $sid; + } + return array(); + } + + protected function getSpellingExpressionId( $spelling, $languageId, $dc = null ) { + 
if ( is_null( $dc ) ) { + $dc = wdGetDataSetContext(); + } + $dbr = wfGetDB( DB_SLAVE ); + + $cond['ORDER BY'] = 'expression_id'; + $cond['LIMIT']= 2; + + $queryResult = $dbr->select( + "{$dc}_expression", + 'expression_id', + array( + 'remove_transaction_id' => null, + 'spelling' => $spelling, + 'language_id' => $languageId + ), + __METHOD__, + $cond + ); + + $expressionId = array(); + foreach ( $queryResult as $expressionIds ) { + $expressionId[] = $expressionIds->expression_id; + } + + if ( $expressionId ) { + return $expressionId; + } + return array(); + + } + + protected function getDuplicates( $dc = null ) { + if ( is_null( $dc ) ) { + $dc = wdGetDataSetContext(); + } + $dbr = wfGetDB( DB_SLAVE ); + + $cond['ORDER BY'] = 'count(spelling) DESC'; + $cond['GROUP BY'] = array( + 'spelling', + 'language_id' + ); + + $queryResult = $dbr->select( + "{$dc}_expression", + array( + 'spelling', + 'language_id', + 'number' => 'count(spelling)' + ), + array( + 'remove_transaction_id' => null + ), + __METHOD__, + $cond + ); + + $duplicates = array(); + foreach ( $queryResult as $dup ) { + if ( $dup->number > 1 ) { + $duplicates[] = array( + 'spelling' => $dup->spelling, + 'language_id' => $dup->language_id + ); + } + } + + if ( $duplicates ) { + return $duplicates; + } + return array(); + + } + +} + +$maintClass = 'RemoveDuplicateExpressions'; +require_once( RUN_MAINTENANCE_IF_MAIN ); diff --git a/OmegaWiki/WikiDataAPI.php b/OmegaWiki/WikiDataAPI.php index 00a6295..836d3f2 100644 --- a/OmegaWiki/WikiDataAPI.php +++ b/OmegaWiki/WikiDataAPI.php @@ -168,7 +168,8 @@ 'remove_transaction_id' => null ), array( /* WHERE */ 'expression_id' => $expressionId - ), __METHOD__ + ), __METHOD__, + array( 'LIMIT' => 1 ) ); } -- To view, visit https://gerrit.wikimedia.org/r/96929 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Id70f8ed1ea392190d3b07f30815c6dfabf16c30a Gerrit-PatchSet: 1 Gerrit-Project: 
mediawiki/extensions/WikiLexicalData Gerrit-Branch: master Gerrit-Owner: Hiong3-eng5 <hiong3.e...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits