jenkins-bot has submitted this change and it was merged.
Change subject: Add generic CC license parsing
......................................................................
Add generic CC license parsing
Handling all the possible combinations of license type / version /
country with a fixed whitelist was getting awkward, so the whitelist is
now replaced by splitting the license name into parts and trying to
recognize each of them.
The disadvantage is that some invalid license names are accepted
(like CC-SA-BY-1.0 or CC-BY-SA-3.0-XX) - this should be of no
practical consequence.
Change-Id: I85af37607db3752b58e577d7ba2a53e91c49113a
---
M CommonsMetadata.php
M CommonsMetadata_body.php
A tests/phpunit/LicenseParserTest.php
3 files changed, 265 insertions(+), 25 deletions(-)
Approvals:
MarkTraceur: Looks good to me, approved
jenkins-bot: Verified
diff --git a/CommonsMetadata.php b/CommonsMetadata.php
index 80c41ce..7f161f7 100755
--- a/CommonsMetadata.php
+++ b/CommonsMetadata.php
@@ -30,3 +30,4 @@
$wgHooks['GetExtendedMetadata'][] = 'CommonsMetadata::onGetExtendedMetadata';
$wgHooks['ValidateExtendedMetadataCache'][] =
'CommonsMetadata::onValidateExtendedMetadataCache';
+$wgHooks['UnitTestsList'][] = 'CommonsMetadata::onUnitTestsList';
diff --git a/CommonsMetadata_body.php b/CommonsMetadata_body.php
index 843fb73..7b565b5 100755
--- a/CommonsMetadata_body.php
+++ b/CommonsMetadata_body.php
@@ -8,26 +8,15 @@
*/
class CommonsMetadata {
/**
- * Mapping of license category names to message strings used in e.g.
- * UploadWizard (not yet centralized, big TODO item)
- *
- * Lowercase everything before checking against this array.
+ * Nonstandard license name patterns used in
categories/templates/shortnames
*/
- static $licenses = array(
- 'cc-by-1.0' => 'cc-by-1.0',
-// 'cc-sa-1.0' => 'cc-sa-1.0', // no shortname
- 'cc-by-sa-1.0' => 'cc-by-sa-1.0',
- 'cc-by-2.0' => 'cc-by-2.0',
- 'cc-by-sa-2.0' => 'cc-by-sa-2.0',
- 'cc-by-2.1' => 'cc-by-2.1',
- 'cc-by-sa-2.1' => 'cc-by-sa-2.1',
- 'cc-by-2.5' => 'cc-by-2.5',
- 'cc-by-sa-2.5' => 'cc-by-sa-2.5',
- 'cc-by-3.0' => 'cc-by-3.0',
- 'cc-by-sa-3.0' => 'cc-by-sa-3.0',
-// 'cc-by-sa-3.0-migrated' => 'cc-by-sa-3.0', // no such shortname
-// 'cc-pd' => 'cc-pd', // no shortname
- 'cc0' => 'cc-zero',
+ static $licenseAliases = array(
+ 'cc-by-sa-3.0-migrated' => 'cc-by-sa-3.0',
+ 'cc-by-sa-3.0-migrated-with-disclaimers' => 'cc-by-sa-3.0',
+ 'cc-by-sa-3.0-2.5-2.0-1.0' => 'cc-by-sa-3.0',
+ 'cc-by-sa-2.5-2.0-1.0' => 'cc-by-sa-2.5',
+ 'cc-by-2.0-stma' => 'cc-by-2.0',
+ 'cc-by-sa-1.0+' => 'cc-by-sa-3.0',
);
/**
@@ -170,6 +159,17 @@
}
/**
+ * Hook to add unit tests
+ * @param array $files
+ * @return bool
+ */
+ public static function onUnitTestsList( &$files ) {
+ $testDir = __DIR__ . DIRECTORY_SEPARATOR . 'tests' .
DIRECTORY_SEPARATOR . 'phpunit';
+ $files = array_merge( $files, glob( $testDir .
DIRECTORY_SEPARATOR . '*Test.php' ) );
+ return true;
+ }
+
+ /**
* @param LocalFile $file
* @return array list of category names in human-readable format
*/
@@ -195,9 +195,9 @@
protected static function getLicensesAndRemoveFromCategories(
&$categories ) {
$licenses = array();
foreach ( $categories as $i => $category ) {
- $category = strtolower( $category );
- if ( isset( self::$licenses[$category] ) ) {
- $licenses[] = self::$licenses[$category];
+ $licenseData = self::parseLicenseString( $category );
+ if ( $licenseData ) {
+ $licenses[] = $licenseData['name'];
unset( $categories[$i] );
}
}
@@ -214,6 +214,7 @@
protected static function getAssessmentsAndRemoveFromCategories(
&$categories ) {
$assessments = array();
foreach ( $categories as $i => $category ) {
+
foreach ( self::$assessmentCategories as
$assessmentType => $regexp ) {
if ( preg_match( $regexp . 'i', $category ) ) {
$assessments[] = $assessmentType;
@@ -236,10 +237,10 @@
*/
protected static function filterShortnamesAndGetLicense( &$shortNames )
{
foreach ( $shortNames as $name ) {
- $name = strtolower( trim( $name ) );
- if ( isset( self::$licenses[$name] ) ) {
+ $licenseData = self::parseLicenseString( $name );
+ if ( $licenseData ) {
$shortNames = array( $name );
- return self::$licenses[$name];
+ return $licenseData['name'];
}
}
return null;
@@ -256,6 +257,75 @@
throw new MWException( 'Invalid language code
specified' );
}
}
+
+ /**
+ * Takes a license string (could be a category name, template name etc)
+ * and returns template information (or null if the license was not
recognized).
+ * Only handles CC licenses for now.
+ * The returned array will have the following keys:
+ * - family: e.g. cc, gfdl
+ * - type: e.g. cc-by-sa
+ * - version: e.g. 2.5
+ * - region: e.g. nl
+ * - name: all the above put together, e.g. cc-by-sa-2.5-nl
+ * @param string $str
+ * @return array|null
+ */
+ public static function parseLicenseString( $str ) {
+ $data = array(
+ 'family' => 'cc',
+ 'type' => null,
+ 'version' => null,
+ 'region' => null,
+ 'name' => null,
+ );
+
+ $str = strtolower( trim( $str ) );
+ if ( isset( self::$licenseAliases[$str] ) ) {
+ $str = self::$licenseAliases[$str];
+ }
+
+ // some special cases first
+ if ( in_array( $str, array( 'cc0', 'cc-pd' ), true ) ) {
+ $data['type'] = $data['name'] = $str;
+ return $data;
+ }
+
+ $parts = explode( '-', $str );
+ if ( $parts[0] != 'cc' ) {
+ return null;
+ }
+
+ for ( $i = 1; isset( $parts[$i] ) && in_array( $parts[$i],
array( 'by', 'sa', 'nc', 'nd' ) ); $i++ ) {
+ if ( in_array( $parts[$i], array( 'nc', 'nd' ) ) ) {
+ // ignore non-free licenses
+ return null;
+ }
+ }
+ $data['type'] = implode( '-', array_slice( $parts, 0, $i ) );
+
+ if ( isset( $parts[$i] ) && is_numeric( $parts[$i] ) ) {
+ $data['version'] = $parts[$i];
+ $i++;
+ } else {
+ return null;
+ }
+
+ if ( isset( $parts[$i] ) && (
+ preg_match( '/^\w\w$/', $parts[$i] )
+ || $parts[$i] == 'scotland'
+ ) ) {
+ $data['region'] = $parts[$i];
+ $i++;
+ }
+
+ if ( $i != count( $parts ) ) {
+ return null;
+ }
+
+ $data['name'] = implode( '-', array_filter( array(
$data['type'], $data['version'], $data['region'] ) ) );
+ return $data;
+ }
}
/**
diff --git a/tests/phpunit/LicenseParserTest.php
b/tests/phpunit/LicenseParserTest.php
new file mode 100644
index 0000000..a8134c0
--- /dev/null
+++ b/tests/phpunit/LicenseParserTest.php
@@ -0,0 +1,169 @@
+<?php
+/**
+ * Created by PhpStorm.
+ * User: Gergő
+ * Date: 2013.11.12.
+ * Time: 16:56
+ */
+
+class LicenseParserTest extends MediaWikiTestCase {
+ /**
+ * @covers CommonsMetadata::parseLicenseString
+ * @group Extensions/CommonsMetadata
+ */
+ public function testEmptyString() {
+ $licenseString = '';
+ $data = CommonsMetadata::parseLicenseString( $licenseString );
+ $this->assertLicenseIsNotRecognized( $data );
+ }
+
+ /**
+ * @covers CommonsMetadata::parseLicenseString
+ * @group Extensions/CommonsMetadata
+ */
+ public function testTotallyWrongString() {
+ $licenseString = 'foo';
+ $data = CommonsMetadata::parseLicenseString( $licenseString );
+ $this->assertLicenseIsNotRecognized( $data );
+ }
+
+ /**
+ * @covers CommonsMetadata::parseLicenseString
+ * @group Extensions/CommonsMetadata
+ */
+ public function testCCLicenseWithoutVersion() {
+ $licenseString = 'cc-by-sa';
+ $data = CommonsMetadata::parseLicenseString( $licenseString );
+ $this->assertLicenseIsNotRecognized( $data );
+ }
+
+ /**
+ * @covers CommonsMetadata::parseLicenseString
+ * @group Extensions/CommonsMetadata
+ */
+ public function testNonFreeLicense() {
+ $licenseString = 'cc-by-nc-sa-3.0';
+ $data = CommonsMetadata::parseLicenseString( $licenseString );
+ $this->assertLicenseIsNotRecognized( $data );
+
+ $licenseString = 'cc-by-nd-2.1';
+ $data = CommonsMetadata::parseLicenseString( $licenseString );
+ $this->assertLicenseIsNotRecognized( $data );
+ }
+
+ /**
+ * @covers CommonsMetadata::parseLicenseString
+ * @group Extensions/CommonsMetadata
+ */
+ public function testNormalCCLicense() {
+ $licenseString = 'cc-by-sa-1.0';
+ $data = CommonsMetadata::parseLicenseString( $licenseString );
+ $this->assertLicenseFamilyEquals( 'cc', $data );
+ $this->assertLicenseTypeEquals( 'cc-by-sa', $data );
+ $this->assertLicenseVersionEquals( '1.0', $data );
+ $this->assertLicenseRegionEquals( null, $data );
+ $this->assertLicenseNameEquals( 'cc-by-sa-1.0', $data );
+ }
+
+ /**
+ * @covers CommonsMetadata::parseLicenseString
+ * @group Extensions/CommonsMetadata
+ */
+ public function testNormalCCLicenseInUppercase() {
+ $licenseString = 'CC-BY-SA-1.0';
+ $data = CommonsMetadata::parseLicenseString( $licenseString );
+ $this->assertLicenseFamilyEquals( 'cc', $data );
+ $this->assertLicenseTypeEquals( 'cc-by-sa', $data );
+ $this->assertLicenseNameEquals( 'cc-by-sa-1.0', $data );
+ }
+
+ /**
+ * @covers CommonsMetadata::parseLicenseString
+ * @group Extensions/CommonsMetadata
+ */
+ public function testCCSALicense() {
+ $licenseString = 'CC-SA-1.0';
+ $data = CommonsMetadata::parseLicenseString( $licenseString );
+ $this->assertLicenseFamilyEquals( 'cc', $data );
+ $this->assertLicenseTypeEquals( 'cc-sa', $data );
+ $this->assertLicenseNameEquals( 'cc-sa-1.0', $data );
+ }
+
+ /**
+ * @covers CommonsMetadata::parseLicenseString
+ * @group Extensions/CommonsMetadata
+ */
+ public function testRegionalCCLicense() {
+ $licenseString = 'cc-by-2.0-fr';
+ $data = CommonsMetadata::parseLicenseString( $licenseString );
+ $this->assertLicenseTypeEquals( 'cc-by', $data );
+ $this->assertLicenseRegionEquals( 'fr', $data );
+ $this->assertLicenseNameEquals( 'cc-by-2.0-fr', $data );
+ }
+
+ /**
+ * @covers CommonsMetadata::parseLicenseString
+ * @group Extensions/CommonsMetadata
+ */
+ public function testRegionalCCLicenseWithInvalidRegion() {
+ $licenseString = 'cc-by-2.0-foo';
+ $data = CommonsMetadata::parseLicenseString( $licenseString );
+ $this->assertLicenseIsNotRecognized( $data );
+ }
+
+ /**
+ * @covers CommonsMetadata::parseLicenseString
+ * @group Extensions/CommonsMetadata
+ */
+ public function testRegionalCCLicenseWithSpecialRegion() {
+ $licenseString = 'cc-by-2.0-scotland';
+ $data = CommonsMetadata::parseLicenseString( $licenseString );
+ $this->assertLicenseTypeEquals( 'cc-by', $data );
+ $this->assertLicenseRegionEquals( 'scotland', $data );
+ $this->assertLicenseNameEquals( 'cc-by-2.0-scotland', $data );
+ }
+
+ public function testCC0() {
+ $licenseString = 'CC0';
+ $data = CommonsMetadata::parseLicenseString( $licenseString );
+ $this->assertLicenseTypeEquals( 'cc0', $data );
+ $this->assertLicenseVersionEquals( null, $data );
+ $this->assertLicenseNameEquals( 'cc0', $data );
+ }
+
+ /**********************************************************************/
+
+ protected function assertLicenseIsRecognized( $licenseData ) {
+ $this->assertNotNull( $licenseData );
+ }
+
+ protected function assertLicenseIsNotRecognized( $licenseData ) {
+ $this->assertNull( $licenseData );
+ }
+
+ protected function assertLicenseElementEquals( $expected, $element,
$licenseData ) {
+ $this->assertInternalType( 'array', $licenseData );
+ $this->assertArrayHasKey( $element, $licenseData );
+ $this->assertEquals( $expected, $licenseData[$element] );
+ }
+
+ protected function assertLicenseFamilyEquals( $family, $licenseData ) {
+ $this->assertLicenseElementEquals( $family, 'family',
$licenseData );
+ }
+
+ protected function assertLicenseTypeEquals( $type, $licenseData ) {
+ $this->assertLicenseElementEquals( $type, 'type', $licenseData
);
+ }
+
+ protected function assertLicenseVersionEquals( $version, $licenseData )
{
+ $this->assertLicenseElementEquals( $version, 'version',
$licenseData );
+ }
+
+ protected function assertLicenseRegionEquals( $region, $licenseData ) {
+ $this->assertLicenseElementEquals( $region, 'region',
$licenseData );
+ }
+
+ protected function assertLicenseNameEquals( $name, $licenseData ) {
+ $this->assertLicenseElementEquals( $name, 'name', $licenseData
);
+ }
+}
--
To view, visit https://gerrit.wikimedia.org/r/94931
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I85af37607db3752b58e577d7ba2a53e91c49113a
Gerrit-PatchSet: 5
Gerrit-Project: mediawiki/extensions/CommonsMetadata
Gerrit-Branch: master
Gerrit-Owner: Gergő Tisza <[email protected]>
Gerrit-Reviewer: Gergő Tisza <[email protected]>
Gerrit-Reviewer: MarkTraceur <[email protected]>
Gerrit-Reviewer: jenkins-bot
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits