jenkins-bot has submitted this change and it was merged.

Change subject: Add support for TMX level 1 (plaintext)
......................................................................


Add support for TMX level 1 (plaintext)

I think this is enough and level 2 is not worth it.
JSON format is superior as it contains MT output as well.

Bug: T122042
Change-Id: Ia9f75cf45fc89e3b25054bc367fa47dec60352b3
---
M scripts/dump-corpora.php
1 file changed, 111 insertions(+), 21 deletions(-)

Approvals:
  Santhosh: Looks good to me, approved
  Nikerabbit: Looks good to me, but someone else must approve
  jenkins-bot: Verified



diff --git a/scripts/dump-corpora.php b/scripts/dump-corpora.php
index 350b199..2399c84 100644
--- a/scripts/dump-corpora.php
+++ b/scripts/dump-corpora.php
@@ -40,7 +40,7 @@
 
                $this->addOption(
                        'format',
-                       '(optional) Dump format. Defaults to JSON.',
+                       '(optional) Dump format. Available formats json 
(default) and tmx.',
                        false, /*required*/
                        true /*has arg*/
                );
@@ -69,6 +69,8 @@
                $split = $this->getOption( 'split-at', false );
                $type = $plain ? 'text' : 'html';
 
+               $formatSpec = array( $format, $type );
+
                $limit = 999999999;
                $offset = 0;
                $translations = Translation::getAllPublishedTranslations(
@@ -83,16 +85,30 @@
                $lookup = new CorporaLookup( $db );
                foreach ( $translations as &$translation ) {
                        $translation['corpora'] = $lookup->getByTranslationId( 
$translation['translationId'] );
-                       if ( $plain ) {
-                               foreach ( $translation['corpora'] as $id => 
$unit ) {
-                                       foreach ( $unit as $field => $value ) {
-                                               if ( !isset( $value['content'] 
) ) {
-                                                       continue;
-                                               }
 
-                                               
$translation['corpora'][$id][$field]['content'] =
-                                                       
Sanitizer::stripAllTags( $value['content'] );
+                       // Some general cleanup
+                       foreach ( $translation['corpora'] as $id => $unit ) {
+                               if ( !isset( $unit['user'] ) ) {
+                                       unset( $translation['corpora'][$id] );
+                                       continue;
+                               }
+
+                               unset( 
$translation['corpora'][$id]['source']['engine'] );
+                               unset( 
$translation['corpora'][$id]['user']['engine'] );
+                       }
+
+                       if ( !$plain ) {
+                               continue;
+                       }
+
+                       foreach ( $translation['corpora'] as $id => $unit ) {
+                               foreach ( $unit as $field => $value ) {
+                                       if ( !isset( $value['content'] ) ) {
+                                               continue;
                                        }
+
+                                       
$translation['corpora'][$id][$field]['content'] =
+                                               Sanitizer::stripAllTags( 
$value['content'] );
                                }
                        }
                }
@@ -101,7 +117,7 @@
                        $source = $sourceLanguage ?: '_';
                        $target = $targetLanguage ?: '_';
                        $filename = 
"cx-corpora.{$source}2{$target}.$type.$format";
-                       $this->export( $format, $filename, $translations );
+                       $this->export( $formatSpec, $filename, $translations, 
$source );
 
                        return;
                }
@@ -114,7 +130,7 @@
                                }
 
                                $filename = 
"cx-corpora.{$sourceLanguage}2{$targetLanguage}.$type.$format";
-                               $this->export( $format, $filename, $targets );
+                               $this->export( $formatSpec, $filename, 
$targets, $sourceLanguage );
                                unset( 
$sorted[$targetLanguage][$sourceLanguage] );
                        }
 
@@ -132,14 +148,14 @@
                        }
 
                        $filename = 
"cx-corpora._2{$targetLanguage}.$type.$format";
-                       $this->export( $format, $filename, $targets );
+                       $this->export( $formatSpec, $filename, $targets, '_' );
                        unset( $sorted[$targetLanguage] );
                }
 
                if ( count( $sorted ) ) {
                        $targets = call_user_func_array( 'array_merge', $sorted 
);
                        $filename = "cx-corpora._2_.$type.$format";
-                       $this->export( $format, $filename, $targets );
+                       $this->export( $formatSpec, $filename, $targets, '_' );
                }
        }
 
@@ -163,27 +179,33 @@
                return $sorted;
        }
 
-       public function export( $format, $filename, array $targets ) {
-               if ( $format !== 'json' ) {
+       public function export( $formatSpec, $filename, array $targets, 
$sourceLanguage ) {
+               $data = null;
+
+               list( $format, $type ) = $formatSpec;
+
+               if ( $format === 'json' ) {
+                       $data = $this->formatJSON( $targets );
+               } elseif ( $format === 'tmx' ) {
+                       $data = $this->formatTMX( $targets, $sourceLanguage, 
$type );
+               } else {
                        $this->error( "Unknown output format\n", 1 );
                }
 
-               $data = $this->formatJSON( $targets );
                if ( $data ) {
                        file_put_contents( $filename, $data );
                        $this->output( "$filename\n" );
                }
        }
 
+       /**
+        * @param array $targets
+        * @return string|null
+        */
        public function formatJSON( array $targets ) {
                $output = array();
                foreach ( $targets as $translation ) {
                        foreach ( $translation['corpora'] as $id => $unit ) {
-                               if ( !isset( $unit['user'] ) ) {
-                                       continue;
-                               }
-
-                               unset( $unit['source']['engine'], 
$unit['user']['engine'] );
                                unset( $unit['source']['timestamp'], 
$unit['user']['timestamp'], $unit['mt']['timestamp'] );
 
                                $globalId = 
"{$translation['translationId']}/$id";
@@ -204,6 +226,74 @@
                        return null;
                }
        }
+
+       /**
+        * @param array $targets
+        * @param string $sourceLanguage Language code.
+        * @param string $type Either html or text.
+        * @return string|null
+        */
+       public function formatTMX( array $targets, $sourceLanguage, $type ) {
+               if ( $type === 'html' ) {
+                       $this->error( "TMX output format is only supported with 
plaintext\n", 1 );
+               }
+
+               $xml = new DOMDocument( '1.0', 'UTF-8' );
+               $spawn = function ( $tag, array $attributes = array() ) use ( 
$xml ) {
+                       $element = $xml->createElement( $tag );
+                       foreach ( $attributes as $key => $value ) {
+                               $element->setAttribute( $key, $value );
+                       }
+
+                       return $element;
+               };
+
+               $tmx = $spawn( 'tmx', array( 'version' => '1.4' ) );
+               $xml->appendChild( $tmx );
+
+               $header = $spawn( 'header', array(
+                       'creationtool' => 'dump-corpora.php / DOMDocument',
+                       'creationtoolversion' => '1.0.0',
+                       // Could be paragraph, but not guaranteed...
+                       'segtype' => 'block',
+                       'o-tmf' => 'sql',
+                       'adminlang' => 'en',
+                       'sourcelang' => $sourceLanguage === '_' ? '*all*' : 
$sourceLanguage,
+                       'datatype' => 'plaintext',
+               ) );
+               $tmx->appendChild( $header );
+
+               $body = $spawn( 'body' );
+               $tmx->appendChild( $body );
+
+               foreach ( $targets as $translation ) {
+                       foreach ( $translation['corpora'] as $id => $units ) {
+                               $tu = $spawn( 'tu', array( 'srclang' => 
$translation['sourceLanguage'] ) );
+                               $body->appendChild( $tu );
+                               foreach ( $units as $origin => $unit ) {
+                                       if ( $unit['content'] === null ) {
+                                               continue;
+                                       }
+                                       if ( $origin === 'source' ) {
+                                               $tuv = $spawn( 'tuv', array( 
'xml:lang' => $translation['sourceLanguage'] ) );
+                                       } else {
+                                               $tuv = $spawn( 'tuv', array( 
'xml:lang' => $translation['targetLanguage'] ) );
+                                       }
+                                       $tu->appendChild( $tuv );
+                                       $tuvProp = $spawn( 'prop', array( 
'type' => 'origin' ) );
+                                       $tuvProp->appendChild( 
$xml->createTextNode( $origin ) );
+                                       $tuv->appendChild( $tuvProp );
+                                       $tuvSeg = $spawn( 'seg' );
+                                       $tuv->appendChild( $tuvSeg );
+                                       $tuvContent = $xml->createTextNode( 
$unit['content'] );
+                                       $tuvSeg->appendChild( $tuvContent );
+                               }
+                       }
+               }
+               // Format the output with indentation for readability
+               $xml->formatOutput = true;
+               return $xml->saveXML();
+       }
 }
 
 $maintClass = 'CXCorporaDump';

-- 
To view, visit https://gerrit.wikimedia.org/r/272461
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ia9f75cf45fc89e3b25054bc367fa47dec60352b3
Gerrit-PatchSet: 5
Gerrit-Project: mediawiki/extensions/ContentTranslation
Gerrit-Branch: master
Gerrit-Owner: Nikerabbit <[email protected]>
Gerrit-Reviewer: KartikMistry <[email protected]>
Gerrit-Reviewer: Nikerabbit <[email protected]>
Gerrit-Reviewer: Santhosh <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to