Nikerabbit has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/271217

Change subject: First steps towards dumps
......................................................................

First steps towards dumps

A flexible script to provide JSON dumps.

Bug: T122042
Change-Id: Ie3b88c86283a6290ceb95b59b4b4cb7e748ab0ed
---
A scripts/dump-corpora.php
1 file changed, 208 insertions(+), 0 deletions(-)


  git pull 
ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/ContentTranslation 
refs/changes/17/271217/1

diff --git a/scripts/dump-corpora.php b/scripts/dump-corpora.php
new file mode 100644
index 0000000..f67332a
--- /dev/null
+++ b/scripts/dump-corpora.php
@@ -0,0 +1,208 @@
+<?php
+/**
+ *
+ * @file
+ * @author Niklas Laxström
+ * @license GPL-2.0+
+ */
+
+// Standard boilerplate to define $IP
+if ( getenv( 'MW_INSTALL_PATH' ) !== false ) {
+       $IP = getenv( 'MW_INSTALL_PATH' );
+} else {
+       $dir = __DIR__;
+       $IP = "$dir/../../..";
+}
+require_once "$IP/maintenance/Maintenance.php";
+
+use ContentTranslation\CorporaLookup;
+use ContentTranslation\Database;
+use ContentTranslation\Translation;
+
+
+class CxDump extends Maintenance {
+       public function __construct() {
+               parent::__construct();
+               $this->mDescription = 'Script to fix some cx stats numbers.';
+
+               $this->addOption(
+                       'source-language',
+                       '(optional) Source language',
+                       false, /*required*/
+                       true /*has arg*/
+               );
+
+               $this->addOption(
+                       'source-language',
+                       '(optional) Source language',
+                       false, /*required*/
+                       true /*has arg*/
+               );
+
+               $this->addOption(
+                       'format',
+                       '(optional) Dump format. Defaults to JSON.',
+                       false, /*required*/
+                       true /*has arg*/
+               );
+
+               $this->addOption(
+                       'split-at',
+                       '(optional) If there are more than this published 
articles, also create split dumps.',
+                       false, /*required*/
+                       true /*has arg*/
+               );
+
+               $this->addOption(
+                       'plaintext',
+                       '(optional) Strip away html.'
+               );
+
+
+               $this->resets = array();
+               $this->tags = array();
+       }
+
+       public function execute() {
+               $sourceLanguage = $this->getOption( 'source-language', false );
+               $targetLanguage = $this->getOption( 'target-language', false );
+               $format = $this->getOption( 'format', 'json' );
+               $plain = $this->getOption( 'plaintext', false );
+               $split = $this->getOption( 'split-at', false );
+               $type = $plain ? 'text' : 'html';
+
+               $limit = 999999999;
+               $offset = 0;
+               $translations = Translation::getAllPublishedTranslations( 
$sourceLanguage, $targetLanguage, $limit, $offset );
+
+               // Fetch the actual interesting data
+               $db = Database::getConnection( DB_SLAVE );
+               $lookup = new CorporaLookup( $db );
+               foreach ( $translations as &$translation ) {
+                       $translation['corpora'] = $lookup->getByTranslationId( 
$translation['translationId'] );
+                       if ( $plain ) {
+                               foreach ( $translation['corpora'] as $id => 
$unit ) {
+                                       foreach ( $unit as $field => $value ) {
+                                               if ( !isset( $value['content'] 
) ) {
+                                                       continue;
+                                               }
+
+                                               
$translation['corpora'][$id][$field]['content'] =
+                                                       
Sanitizer::stripAllTags( $value['content'] );
+                                       }
+                               }
+                       }
+               }
+
+               if ( !$split ) {
+                       $source = $sourceLanguage ?: '_';
+                       $target = $targetLanguage ?: '_';
+                       $filename = 
"cx-corpora.{$source}2{$target}.$type.$format";
+                       $this->export( $format, $filename, $translations );
+
+                       return;
+               }
+
+               $sorted = $this->sortTranslations( $translations );
+               foreach ( $sorted as $targetLanguage => $sourceLanguages ) {
+                       foreach ( $sourceLanguages as $sourceLanguage => 
$targets ) {
+                               if ( count( $targets ) < $split ) {
+                                       continue;
+                               }
+
+                               $filename = 
"cx-corpora.{$sourceLanguage}2{$targetLanguage}.$type.$format";
+                               $this->export( $format, $filename, $targets );
+                               unset( 
$sorted[$targetLanguage][$sourceLanguage] );
+                       }
+
+                       // Check whether we exported everything
+                       if ( count( $sorted[$targetLanguage] ) === 0 ) {
+                               unset( $sorted[$targetLanguage] );
+                               continue;
+                       }
+
+                       // Flatten the rest
+                       $sorted[$targetLanguage] = call_user_func_array( 
'array_merge', $sorted[$targetLanguage] );
+                       // Export now if threshold is met, otherwise leave it 
to the end any2any.
+                       if ( count( $targets ) < $split ) {
+                               continue;
+                       }
+
+                       $filename = 
"cx-corpora._2{$targetLanguage}.$type.$format";
+                       $this->export( $format, $filename, $targets );
+                       unset( $sorted[$targetLanguage] );
+               }
+
+               if ( count( $sorted ) ) {
+                       $targets = call_user_func_array( 'array_merge', $sorted 
);
+                       $filename = "cx-corpora._2_.$type.$format";
+                       $this->export( $format, $filename, $targets );
+               }
+       }
+
+       public function sortTranslations( $translations ) {
+               $sorted = array();
+               foreach ( $translations as $translation ) {
+                       $sourceLanguage = $translation['sourceLanguage'];
+                       $targetLanguage = $translation['targetLanguage'];
+
+                       if ( !isset( $sorted[$targetLanguage] ) ) {
+                               $sorted[$targetLanguage] = array();
+                       }
+
+                       if ( !isset( $sorted[$targetLanguage][$sourceLanguage] 
) ) {
+                               $sorted[$targetLanguage][$sourceLanguage] = 
array();
+                       }
+
+                       $sorted[$targetLanguage][$sourceLanguage][] = 
$translation;
+               }
+
+               return $sorted;
+       }
+
+       public function export( $format, $filename, array $targets ) {
+               if ( $format !== 'json'  ) {
+                       $this->error( "Unknown output format\n", 1 );
+               }
+
+               $data = $this->formatJSON( $targets );
+               if ( $data ) {
+                       file_put_contents( $filename, $data );
+               }
+       }
+
+       public function formatJSON( array $targets ) {
+               $output = array();
+               foreach ( $targets as $translation ) {
+                       foreach ( $translation['corpora'] as $id => $unit ) {
+                               if ( !isset( $unit['user'] ) ) {
+                                       continue;
+                               }
+
+                               $sourceLanguage = 
$translation['sourceLanguage'];
+                               $targetLanguage = 
$translation['targetLanguage'];
+
+                               unset( $unit['source']['engine'], 
$unit['user']['engine'] );
+                               unset( $unit['source']['timestamp'], 
$unit['user']['timestamp'], $unit['mt']['timestamp'] );
+
+                               $globalId = 
"{$translation['translationId']}/$id";
+                               $output[] = array(
+                                       'id' => $id,
+                                       'languages' => "$sourceLanguage 
$targetLanguage",
+                                       'source' => $unit['source'],
+                                       'mt' => $unit['mt'],
+                                       'target' => $unit['user'],
+                               );
+                       }
+               }
+
+               if ( $output ) {
+                       return FormatJson::encode( $output, true, 
FormatJson::ALL_OK );
+               } else {
+                       return null;
+               }
+       }
+}
+
+$maintClass = 'CxDump';
+require_once RUN_MAINTENANCE_IF_MAIN;

-- 
To view, visit https://gerrit.wikimedia.org/r/271217
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie3b88c86283a6290ceb95b59b4b4cb7e748ab0ed
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/ContentTranslation
Gerrit-Branch: master
Gerrit-Owner: Nikerabbit <niklas.laxst...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to