Gergő Tisza has uploaded a new change for review.
https://gerrit.wikimedia.org/r/94931
Change subject: Add generic CC license parsing
......................................................................
Add generic CC license parsing
All the possible combinations of license type / version / country
were getting awkward to handle with a fixed whitelist, so it is now
replaced by splitting the license name into parts and trying to
recognize them.
The disadvantage is that some invalid license names are accepted
(like CC-SA-BY-1.0 or CC-BY-SA-3.0-XX) - this should be of no
practical consequence.
Change-Id: I85af37607db3752b58e577d7ba2a53e91c49113a
---
M CommonsMetadata.php
M CommonsMetadata_body.php
A tests/phpunit/LicenseParserTest.php
3 files changed, 116 insertions(+), 53 deletions(-)
git pull
ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CommonsMetadata
refs/changes/31/94931/1
diff --git a/CommonsMetadata.php b/CommonsMetadata.php
index 80c41ce..7f161f7 100755
--- a/CommonsMetadata.php
+++ b/CommonsMetadata.php
@@ -30,3 +30,4 @@
$wgHooks['GetExtendedMetadata'][] = 'CommonsMetadata::onGetExtendedMetadata';
$wgHooks['ValidateExtendedMetadataCache'][] =
'CommonsMetadata::onValidateExtendedMetadataCache';
+$wgHooks['UnitTestsList'][] = 'CommonsMetadata::onUnitTestsList';
diff --git a/CommonsMetadata_body.php b/CommonsMetadata_body.php
index 8caa875..c72fe21 100755
--- a/CommonsMetadata_body.php
+++ b/CommonsMetadata_body.php
@@ -8,26 +8,14 @@
*/
class CommonsMetadata {
/**
- * Mapping of license category names to message strings used in e.g.
- * UploadWizard (not yet centralized, big TODO item)
- *
- * Lowercase everything before checking against this array.
+ * Nonstandard license name patterns used in
categories/templates/shortnames
*/
- static $licenses = array(
- 'cc-by-1.0' => 'cc-by-1.0',
-// 'cc-sa-1.0' => 'cc-sa-1.0', // no shortname
- 'cc-by-sa-1.0' => 'cc-by-sa-1.0',
- 'cc-by-2.0' => 'cc-by-2.0',
- 'cc-by-sa-2.0' => 'cc-by-sa-2.0',
- 'cc-by-2.1' => 'cc-by-2.1',
- 'cc-by-sa-2.1' => 'cc-by-sa-2.1',
- 'cc-by-2.5' => 'cc-by-2.5',
- 'cc-by-sa-2.5' => 'cc-by-sa-2.5',
- 'cc-by-3.0' => 'cc-by-3.0',
- 'cc-by-sa-3.0' => 'cc-by-sa-3.0',
-// 'cc-by-sa-3.0-migrated' => 'cc-by-sa-3.0', // no such shortname
-// 'cc-pd' => 'cc-pd', // no shortname
+ static $licenseAliases = array(
+ 'cc-by-sa-3.0-migrated' => 'cc-by-sa-3.0',
+ 'cc-pd' => 'cc-pd',
'cc0' => 'cc-zero',
+ 'cc-by-sa-3.0-2.5-2.0-1.0' => 'cc-by-sa-3.0',
+ 'cc-by-sa-2.5-2.0-1.0' => 'cc-by-sa-2.5',
);
/**
@@ -80,10 +68,10 @@
}
if ( isset( $data['LicenseShortName'] ) ) {
- $license = self::getLicenseFromShortname(
$data['LicenseShortName'] );
- if ( $license ) {
+ $licenseData = self::parseLicenseString(
$data['LicenseShortName'] );
+ if ( $licenseData ) {
$combinedMeta['License'] = array(
- 'value' => $license,
+ 'value' => $licenseData['name'],
'source' => 'commons-templates',
);
}
@@ -135,6 +123,12 @@
return true;
}
+ public static function onUnitTestsList( &$files ) {
+ $testDir = __DIR__ . DIRECTORY_SEPARATOR . 'tests' .
DIRECTORY_SEPARATOR . 'phpunit';
+ $files = array_merge( $files, glob( $testDir .
DIRECTORY_SEPARATOR . '*Test.php' ) );
+ return true;
+ }
+
/**
* @param File $file
* @return array list of category names in human-readable format
@@ -151,38 +145,6 @@
}
/**
- * Matches category names to a category => license mapping, removes the
matching categories
- * and returns the corresponding licenses.
- * @param array $categories a list of human-readable category names.
- * @return array
- * FIXME categories do not work with Commons-hosted images due to bug
56598
- */
- protected static function getLicensesAndRemoveFromCategories(
&$categories ) {
- $licenses = array();
- foreach ( $categories as $i => $category ) {
- if ( isset( self::$licenses[$category] ) ) {
- $licenses[] = self::$licenses[$category];
- unset( $categories[$i] );
- }
- }
- return array_merge( $licenses ); // renumber to avoid holes in
array
- }
-
- /**
- * Tries to identify the license based on its short name.
- * @param string $shortName
- * @return string|null one of the values from self::$licenses, or null
if not recognized
- * @see
https://commons.wikimedia.org/wiki/Commons:Machine-readable_data#Machine_readable_data_set_by_license_templates
- */
- protected static function getLicenseFromShortname( $shortName ) {
- $shortName = strtolower( trim( $shortName ) );
- if ( isset( self::$licenses[$shortName] ) ) {
- return self::$licenses[$shortName];
- }
- return null;
- }
-
- /**
* @param String|boolean $lang Language code or false for all langs.
*
* @throws MWException on invalid langcode
@@ -193,6 +155,68 @@
throw new MWException( 'Invalid language code
specified' );
}
}
+
+ /**
+ * Takes a license string (could be a category name, template name etc)
+ * and returns template information (or null if the license was not
recognized).
+ * Only handles CC licenses for now.
+ * The returned array will have the following keys:
+ * - family: e.g. cc, gfdl
+ * - type: e.g. cc-by-sa
+ * - version: e.g. 2.5
+ * - region: e.g. nl
+ * - name: all the above put together, e.g. cc-by-sa-2.5-nl
+ * @param string $str
+ * @return array|null
+ */
+ public static function parseLicenseString( $str ) {
+ $data = array(
+ 'family' => 'cc',
+ 'type' => null,
+ 'version' => null,
+ 'region' => null,
+ 'name' => null,
+ );
+
+ $str = strtolower( trim( $str ) );
+ if ( isset( self::$licenseAliases[$str] ) ) {
+ $str = self::$licenseAliases[$str];
+ }
+ $parts = explode( '-', $str );
+ if ( $parts[0] != 'cc' ) {
+ return null;
+ }
+
+ for ( $i = 1; isset( $parts[$i] ) && in_array( $parts[$i],
array( 'by', 'sa', 'nc', 'nd' ) ); $i++ ) {
+ if ( in_array( $parts[$i], array( 'nc', 'nd' ) ) ) {
+ // ignore non-free licenses
+ return null;
+ }
+ }
+ $data['type'] = implode( '-', array_slice( $parts, 0, $i ) );
+
+ if ( isset( $parts[$i] ) && is_numeric( $parts[$i] ) ) {
+ $data['version'] = $parts[$i];
+ $i++;
+ } else {
+ return null;
+ }
+
+ if ( isset( $parts[$i] ) && (
+ preg_match( '/^\w\w$/', $parts[$i] )
+ || $parts[$i] == 'scotland'
+ ) ) {
+ $data['region'] = $parts[$i];
+ $i++;
+ }
+
+ if ( $i != count( $parts ) ) {
+ return null;
+ }
+
+ $data['name'] = implode( '-', array_filter( array(
$data['type'], $data['version'], $data['region'] ) ) );
+ return $data;
+ }
}
/**
diff --git a/tests/phpunit/LicenseParserTest.php
b/tests/phpunit/LicenseParserTest.php
new file mode 100644
index 0000000..401bce0
--- /dev/null
+++ b/tests/phpunit/LicenseParserTest.php
@@ -0,0 +1,38 @@
+<?php
+/**
+ * Created by PhpStorm.
+ * User: Gergő
+ * Date: 2013.11.12.
+ * Time: 16:56
+ */
+
+class LicenseParserTest extends MediaWikiTestCase {
+ /**
+ * @dataProvider provideLicenseData
+ * @covers CommonsMetadata::parseLicenseString
+ * @group Extensions/CommonsMetadata
+ */
+ public function testParseLicenseString( $str, $family, $type, $version,
$region, $name ) {
+ $data = CommonsMetadata::parseLicenseString( $str );
+ if ( !$data ) {
+ $data = array( null, null, null, null, null );
+ }
+ $this->assertArrayEquals( array( $family, $type, $version,
$region, $name ), $data );
+ }
+
+ public function provideLicenseData() {
+ return array(
+ array( '', null, null, null, null, null ),
+ array( 'foo', null, null, null, null, null ),
+ array( 'cc', null, null, null, null, null ),
+ array( 'cc-by-sa', null, null, null, null, null ),
+ array( 'cc-by-sa-nc-3.0', null, null, null, null, null
),
+ array( 'cc-by-sa-1.0', 'cc', 'cc-by-sa', '1.0', null,
'cc-by-sa-1.0' ),
+ array( 'CC-BY-SA-1.0', 'cc', 'cc-by-sa', '1.0', null,
'cc-by-sa-1.0' ),
+ array( 'cc-sa-1.0', 'cc', 'cc-sa', '1.0', null,
'cc-sa-1.0' ),
+ array( 'cc-by-2.0-fr', 'cc', 'cc-by', '2.0', 'fr',
'cc-by-2.0-fr' ),
+ array( 'cc-by-sa-3.5-scotland', 'cc', 'cc-by-sa',
'3.5', 'scotland', 'cc-by-sa-3.5-scotland' ),
+ array( 'cc-by-sa-3.0-foo', null, null, null, null, null
),
+ );
+ }
+}
--
To view, visit https://gerrit.wikimedia.org/r/94931
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I85af37607db3752b58e577d7ba2a53e91c49113a
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CommonsMetadata
Gerrit-Branch: master
Gerrit-Owner: Gergő Tisza <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits