Gergő Tisza has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/94931


Change subject: Add generic CC license parsing
......................................................................

Add generic CC license parsing

All the possible combinations of license type / version / country
were getting awkward to handle with a fixed whitelist, so the
whitelist is now replaced by splitting the license name into parts
and trying to recognize each of them.

The disadvantage is that some invalid license names are accepted
(like CC-SA-BY-1.0 or CC-BY-SA-3.0-XX) - this should be of no
practical consequence.

Change-Id: I85af37607db3752b58e577d7ba2a53e91c49113a
---
M CommonsMetadata.php
M CommonsMetadata_body.php
A tests/phpunit/LicenseParserTest.php
3 files changed, 116 insertions(+), 53 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CommonsMetadata refs/changes/31/94931/1

diff --git a/CommonsMetadata.php b/CommonsMetadata.php
index 80c41ce..7f161f7 100755
--- a/CommonsMetadata.php
+++ b/CommonsMetadata.php
@@ -30,3 +30,4 @@
 
 $wgHooks['GetExtendedMetadata'][] = 'CommonsMetadata::onGetExtendedMetadata';
 $wgHooks['ValidateExtendedMetadataCache'][] = 
'CommonsMetadata::onValidateExtendedMetadataCache';
+$wgHooks['UnitTestsList'][] = 'CommonsMetadata::onUnitTestsList';
diff --git a/CommonsMetadata_body.php b/CommonsMetadata_body.php
index 8caa875..c72fe21 100755
--- a/CommonsMetadata_body.php
+++ b/CommonsMetadata_body.php
@@ -8,26 +8,14 @@
  */
 class CommonsMetadata {
        /**
-        * Mapping of license category names to message strings used in e.g.
-        * UploadWizard (not yet centralized, big TODO item)
-        *
-        * Lowercase everything before checking against this array.
+        * Nonstandard license name patterns used in 
categories/templates/shortnames
         */
-       static $licenses = array(
-               'cc-by-1.0' => 'cc-by-1.0',
-//             'cc-sa-1.0' => 'cc-sa-1.0', // no shortname
-               'cc-by-sa-1.0' => 'cc-by-sa-1.0',
-               'cc-by-2.0' => 'cc-by-2.0',
-               'cc-by-sa-2.0' => 'cc-by-sa-2.0',
-               'cc-by-2.1' => 'cc-by-2.1',
-               'cc-by-sa-2.1' => 'cc-by-sa-2.1',
-               'cc-by-2.5' => 'cc-by-2.5',
-               'cc-by-sa-2.5' => 'cc-by-sa-2.5',
-               'cc-by-3.0' => 'cc-by-3.0',
-               'cc-by-sa-3.0' => 'cc-by-sa-3.0',
-//             'cc-by-sa-3.0-migrated' => 'cc-by-sa-3.0', // no such shortname
-//             'cc-pd' => 'cc-pd', // no shortname
+       static $licenseAliases = array(
+               'cc-by-sa-3.0-migrated' => 'cc-by-sa-3.0',
+               'cc-pd' => 'cc-pd',
                'cc0' => 'cc-zero',
+               'cc-by-sa-3.0-2.5-2.0-1.0' => 'cc-by-sa-3.0',
+               'cc-by-sa-2.5-2.0-1.0' => 'cc-by-sa-2.5',
        );
 
        /**
@@ -80,10 +68,10 @@
                }
 
                if ( isset( $data['LicenseShortName'] ) ) {
-                       $license = self::getLicenseFromShortname( 
$data['LicenseShortName'] );
-                       if ( $license ) {
+                       $licenseData = self::parseLicenseString( 
$data['LicenseShortName'] );
+                       if ( $licenseData ) {
                                $combinedMeta['License'] = array(
-                                       'value' => $license,
+                                       'value' => $licenseData['name'],
                                        'source' => 'commons-templates',
                                );
                        }
@@ -135,6 +123,12 @@
                return true;
        }
 
+       public static function onUnitTestsList( &$files ) {
+               $testDir = __DIR__ . DIRECTORY_SEPARATOR . 'tests' . 
DIRECTORY_SEPARATOR . 'phpunit';
+               $files = array_merge( $files, glob( $testDir . 
DIRECTORY_SEPARATOR . '*Test.php' ) );
+               return true;
+       }
+
        /**
         * @param File $file
         * @return array list of category names in human-readable format
@@ -151,38 +145,6 @@
        }
 
        /**
-        * Matches category names to a category => license mapping, removes the 
matching categories
-        * and returns the corresponding licenses.
-        * @param array $categories a list of human-readable category names.
-        * @return array
-        * FIXME categories do not work with Commons-hosted images due to bug 
56598
-        */
-       protected static function getLicensesAndRemoveFromCategories( 
&$categories ) {
-               $licenses = array();
-               foreach ( $categories as $i => $category ) {
-                       if ( isset( self::$licenses[$category] ) ) {
-                               $licenses[] = self::$licenses[$category];
-                               unset( $categories[$i] );
-                       }
-               }
-               return array_merge( $licenses ); // renumber to avoid holes in 
array
-       }
-
-       /**
-        * Tries to identify the license based on its short name.
-        * @param string $shortName
-        * @return string|null one of the values from self::$licenses, or null 
if not recognized
-        * @see 
https://commons.wikimedia.org/wiki/Commons:Machine-readable_data#Machine_readable_data_set_by_license_templates
-        */
-       protected static function getLicenseFromShortname( $shortName ) {
-               $shortName = strtolower( trim( $shortName ) );
-               if ( isset( self::$licenses[$shortName] ) ) {
-                       return self::$licenses[$shortName];
-               }
-               return null;
-       }
-
-       /**
         * @param String|boolean $lang Language code or false for all langs.
         *
         * @throws MWException on invalid langcode
@@ -193,6 +155,68 @@
                        throw new MWException( 'Invalid language code 
specified' );
                }
        }
+
+       /**
+        * Takes a license string (could be a category name, template name etc)
+        * and returns template information (or null if the license was not 
recognized).
+        * Only handles CC licenses for now.
+        * The returned array will have the following keys:
+        * - family: e.g. cc, gfdl
+        * - type: e.g. cc-by-sa
+        * - version: e.g. 2.5
+        * - region: e.g. nl
+        * - name: all the above put together, e.g. cc-by-sa-2.5-nl
+        * @param string $str
+        * @return array|null
+        */
+       public static function parseLicenseString( $str ) {
+               $data = array(
+                       'family' => 'cc',
+                       'type' => null,
+                       'version' => null,
+                       'region' => null,
+                       'name' => null,
+               );
+
+               $str = strtolower( trim( $str ) );
+               if ( isset( self::$licenseAliases[$str] ) ) {
+                       $str = self::$licenseAliases[$str];
+               }
+               $parts = explode( '-', $str );
+               if ( $parts[0] != 'cc' ) {
+                       return null;
+               }
+
+               for ( $i = 1; isset( $parts[$i] ) && in_array( $parts[$i], 
array( 'by', 'sa', 'nc', 'nd' ) ); $i++ ) {
+                       if ( in_array( $parts[$i], array( 'nc', 'nd' ) ) ) {
+                               // ignore non-free licenses
+                               return null;
+                       }
+               }
+               $data['type'] = implode( '-', array_slice( $parts, 0, $i ) );
+
+               if ( isset( $parts[$i] ) && is_numeric( $parts[$i] ) ) {
+                       $data['version'] = $parts[$i];
+                       $i++;
+               } else {
+                       return null;
+               }
+
+               if ( isset( $parts[$i] ) && (
+                       preg_match( '/^\w\w$/', $parts[$i] )
+                       || $parts[$i] == 'scotland'
+               ) ) {
+                       $data['region'] = $parts[$i];
+                       $i++;
+               }
+
+               if ( $i != count( $parts ) ) {
+                       return null;
+               }
+
+               $data['name'] = implode( '-', array_filter( array( 
$data['type'], $data['version'], $data['region'] ) ) );
+               return $data;
+       }
 }
 
 /**
diff --git a/tests/phpunit/LicenseParserTest.php 
b/tests/phpunit/LicenseParserTest.php
new file mode 100644
index 0000000..401bce0
--- /dev/null
+++ b/tests/phpunit/LicenseParserTest.php
@@ -0,0 +1,38 @@
+<?php
+/**
+ * Created by PhpStorm.
+ * User: Gergő
+ * Date: 2013.11.12.
+ * Time: 16:56
+ */
+
+class LicenseParserTest extends MediaWikiTestCase {
+       /**
+        * @dataProvider provideLicenseData
+        * @covers CommonsMetadata::parseLicenseString
+        * @group Extensions/CommonsMetadata
+        */
+       public function testParseLicenseString( $str, $family, $type, $version, 
$region, $name ) {
+               $data = CommonsMetadata::parseLicenseString( $str );
+               if ( !$data ) {
+                       $data = array( null, null, null, null, null );
+               }
+               $this->assertArrayEquals( array( $family, $type, $version, 
$region, $name ), $data );
+       }
+
+       public function provideLicenseData() {
+               return array(
+                       array( '', null, null, null, null, null ),
+                       array( 'foo', null, null, null, null, null ),
+                       array( 'cc', null, null, null, null, null ),
+                       array( 'cc-by-sa', null, null, null, null, null ),
+                       array( 'cc-by-sa-nc-3.0', null, null, null, null, null 
),
+                       array( 'cc-by-sa-1.0', 'cc', 'cc-by-sa', '1.0', null, 
'cc-by-sa-1.0' ),
+                       array( 'CC-BY-SA-1.0', 'cc', 'cc-by-sa', '1.0', null, 
'cc-by-sa-1.0' ),
+                       array( 'cc-sa-1.0', 'cc', 'cc-sa', '1.0', null, 
'cc-sa-1.0' ),
+                       array( 'cc-by-2.0-fr', 'cc', 'cc-by', '2.0', 'fr', 
'cc-by-2.0-fr' ),
+                       array( 'cc-by-sa-3.5-scotland', 'cc', 'cc-by-sa', 
'3.5', 'scotland', 'cc-by-sa-3.5-scotland' ),
+                       array( 'cc-by-sa-3.0-foo', null, null, null, null, null 
),
+               );
+       }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/94931
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I85af37607db3752b58e577d7ba2a53e91c49113a
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CommonsMetadata
Gerrit-Branch: master
Gerrit-Owner: Gergő Tisza <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to