Hiong3-eng5 has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/96929


Change subject: Remove Duplicate Expressions
......................................................................

Remove Duplicate Expressions

Find, adjust, remove.

This does not delete the duplicate entry; it is just marked as removed. In
the future, when the dm expression is not used, these duplicates should be
deleted. As a precaution, the reviveExpression function was changed to
limit the revival of expressions to 1 expression.

Some duplicates like Tala have the same transaction id, so some
expressions could be from the trailing space bug. Some may have been from
reviveExpression. By the way, Micheal's list must be rechecked in OmegaWiki
after this script is used. For example, the Tala duplicates have the same
definition yet conflicting annotations.

I have checked SQL to see if the duplicates were deleted, checked if the
entries are ok, deleted all groundnut expressions to see if the expression
is removed, then entered them again to see if the expression is revived.
The SQL said it's all good; maybe you could double-check this? Thanks.

Change-Id: Id70f8ed1ea392190d3b07f30815c6dfabf16c30a
---
A Console/removeDuplicateExpression.php
M OmegaWiki/WikiDataAPI.php
2 files changed, 263 insertions(+), 1 deletion(-)


  git pull 
ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikiLexicalData 
refs/changes/29/96929/1

diff --git a/Console/removeDuplicateExpression.php 
b/Console/removeDuplicateExpression.php
new file mode 100644
index 0000000..ea27b8d
--- /dev/null
+++ b/Console/removeDuplicateExpression.php
@@ -0,0 +1,261 @@
+<?php
+
+/**
+* Maintenance script to remove duplicate expressions
+*/
+
+$baseDir = dirname( __FILE__ ) . '/../../..' ;
+require_once( $baseDir . '/maintenance/Maintenance.php' );
+require_once( $baseDir . 
'/extensions/WikiLexicalData/OmegaWiki/WikiDataGlobals.php' );
+
+echo "start\n";
+
+class RemoveDuplicateExpressions extends Maintenance {
+
+       public function __construct() {
+               parent::__construct();
+               $this->mDescription = "Maintenance tool to remove duplicated 
expressions\n"
+                       . 'Example usage: php removeDuplicateExpression.php 
--test=true ' . "\n"
+                       . ' or simply' . "\n"
+                       . 'php removeDuplicateExpression.php' . "\n";
+               $this->addOption( 'test', 'true for test mode. e.g. 
--test=true' );
+       }
+
+       public function execute() {
+
+               global $wdCurrentContext;
+
+               $this->test = false;
+       //      $this->test = true;
+               if ( $this->hasOption( 'test' ) ) {
+                       $this->test = true;
+               }
+
+               $this->output( "Starting remove duplicate expressions 
function...\n" );
+               // check if there are duplicates greater than two
+               $this->output( "Finding duplicates\n" );
+               $duplicates = $this->getDuplicates();
+
+               $haveDuplicates = 0;
+               $syntransHaveDuplicates = 0;
+               $sid = array();
+               if ( $duplicates ) {
+                       $haveDuplicates = 1;
+                       foreach ( $duplicates as $rows ) {
+                               $expression = $this->getSpellingExpressionId( 
$rows['spelling'], $rows['language_id'] );
+                               $this->output( "process {$rows['spelling']} 
({$rows['language_id']}) - expression id: original is {$expression[0]}; 
duplicate is {$expression[1]}\n");
+                               $syntrans = $this->getSyntransToUpdate( 
$expression );
+
+                               if ( $syntrans ) {
+                                       $syntransHaveDuplicates = 1;
+                                       foreach( $syntrans as $sids ) {
+                                               $sid[] = $sids;
+                                               if ( !$this->test ) {
+                                                       // correct the 
duplication
+                                                       
$this->correctDuplication( $sids, $expression );
+                                               }
+                                       }
+                               }
+
+                               if ( !$this->test ) {
+                                       // remove the duplicate
+                                       $this->output( "removing duplicate id 
{$expression[1]}\n");
+                                       $this->deleteDuplicate( $expression[1], 
$rows['language_id'] );
+
+                               }
+
+                       }
+               }
+
+               if ( $sid ) {
+                       $totalSids = count( $sid );
+                       $this->output( "There are a total of {$totalSids} 
corrected\n");
+               }
+
+               if ( !$haveDuplicates ) {
+                       $this->output( "Congratulations! No duplicates found\n" 
);
+                       return true;
+               }
+
+               if ( !$syntransHaveDuplicates ) {
+                       $this->output( "Congratulations! No syntrans have the 
duplicate expressions\n" );
+               }
+
+       }
+
+       protected function deleteDuplicate( $expressionId, $languageId, $dc = 
null ) {
+               if ( is_null( $dc ) ) {
+                       $dc = wdGetDataSetContext();
+               }
+               $dbr = wfGetDB( DB_SLAVE );
+
+               $cond = null;
+
+               // remove instead of delete. Lazier way out... for now.
+               // reviving expression is now limited to one expression
+               // to avoid duplicates from being revived.
+               // Adding to TODO list ~ he
+
+/*             $queryResult = $dbr->delete(
+                       "{$dc}_expression",
+                       array(
+                               'remove_transaction_id' => null,
+                               'expression_id' => $expressionId,
+                               'language_id' => $languageId
+                       ),
+                       __METHOD__
+               );
+
+*/
+
+               $transactionId = getUpdateTransactionId();
+               $queryResult = $dbr->update(
+                       "{$dc}_expression",
+                       array(
+                               'remove_transaction_id' => $transactionId,
+                       ),
+                       array(
+                               'remove_transaction_id' => null,
+                               'expression_id' => $expressionId,
+                       ),
+                       __METHOD__,
+                       $cond
+               );
+       }
+
+       protected function correctDuplication( $syntransSid, $expressionId, $dc 
= null ) {
+               if ( is_null( $dc ) ) {
+                       $dc = wdGetDataSetContext();
+               }
+               $dbr = wfGetDB( DB_SLAVE );
+
+               $cond = null;
+
+               $queryResult = $dbr->update(
+                       "{$dc}_syntrans",
+                       array(
+                               'expression_id' => $expressionId[0],
+                       ),
+                       array(
+                               'remove_transaction_id' => null,
+                               'expression_id' => $expressionId[1],
+                               'syntrans_sid' => $syntransSid
+                       ),
+                       __METHOD__,
+                       $cond
+               );
+
+       }
+
+       protected function getSyntransToUpdate( $expressionIds, $dc = null ) {
+               if ( is_null( $dc ) ) {
+                       $dc = wdGetDataSetContext();
+               }
+               $dbr = wfGetDB( DB_SLAVE );
+
+               $cond = null;
+
+               $queryResult = $dbr->select(
+                       "{$dc}_syntrans",
+                       array(
+                               'syntrans_sid',
+                       ),
+                       array(
+                               'remove_transaction_id' => null,
+                               'expression_id' => $expressionIds[1]
+                       ),
+                       __METHOD__,
+                       $cond
+               );
+
+               $sid = array();
+               foreach ( $queryResult as $sids ) {
+                       $sid[] = $sids->syntrans_sid;
+               }
+
+               if ( $sid ) {
+                       return $sid;
+               }
+               return array();
+       }
+
+       protected function getSpellingExpressionId( $spelling, $languageId, $dc 
= null ) {
+               if ( is_null( $dc ) ) {
+                       $dc = wdGetDataSetContext();
+               }
+               $dbr = wfGetDB( DB_SLAVE );
+
+               $cond['ORDER BY'] = 'expression_id';
+               $cond['LIMIT']= 2;
+
+               $queryResult = $dbr->select(
+                       "{$dc}_expression",
+                       'expression_id',
+                       array(
+                               'remove_transaction_id' => null,
+                               'spelling' => $spelling,
+                               'language_id' => $languageId
+                       ),
+                       __METHOD__,
+                       $cond
+               );
+
+               $expressionId = array();
+               foreach ( $queryResult as $expressionIds ) {
+                       $expressionId[] = $expressionIds->expression_id;
+               }
+
+               if ( $expressionId ) {
+                       return $expressionId;
+               }
+               return array();
+
+       }
+
+       protected function getDuplicates( $dc = null ) {
+               if ( is_null( $dc ) ) {
+                       $dc = wdGetDataSetContext();
+               }
+               $dbr = wfGetDB( DB_SLAVE );
+
+               $cond['ORDER BY'] = 'count(spelling) DESC';
+               $cond['GROUP BY'] = array(
+                       'spelling',
+                       'language_id'
+               );
+
+               $queryResult = $dbr->select(
+                       "{$dc}_expression",
+                       array(
+                               'spelling',
+                               'language_id',
+                               'number' => 'count(spelling)'
+                       ),
+                       array(
+                               'remove_transaction_id' => null
+                       ),
+                       __METHOD__,
+                       $cond
+               );
+
+               $duplicates = array();
+               foreach ( $queryResult as $dup ) {
+                       if ( $dup->number > 1 ) {
+                               $duplicates[] = array(
+                                       'spelling' => $dup->spelling,
+                                       'language_id' => $dup->language_id
+                               );
+                       }
+               }
+
+               if ( $duplicates ) {
+                       return $duplicates;
+               }
+               return array();
+
+       }
+
+}
+
+$maintClass = 'RemoveDuplicateExpressions';
+require_once( RUN_MAINTENANCE_IF_MAIN );
diff --git a/OmegaWiki/WikiDataAPI.php b/OmegaWiki/WikiDataAPI.php
index 00a6295..836d3f2 100644
--- a/OmegaWiki/WikiDataAPI.php
+++ b/OmegaWiki/WikiDataAPI.php
@@ -168,7 +168,8 @@
                        'remove_transaction_id' => null
                ), array( /* WHERE */
                        'expression_id' => $expressionId
-               ), __METHOD__
+               ), __METHOD__,
+               array( 'LIMIT' => 1 )
        );
 }
 

-- 
To view, visit https://gerrit.wikimedia.org/r/96929
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Id70f8ed1ea392190d3b07f30815c6dfabf16c30a
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/WikiLexicalData
Gerrit-Branch: master
Gerrit-Owner: Hiong3-eng5 <hiong3.e...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to