Daniel Kinzler has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/235512

Change subject: Check URIs in data values against vocabulary base URI.
......................................................................

Check URIs in data values against vocabulary base URI.

We use URIs as identifiers for calendars, reference globes, and units
of measurement. The vocabulary these URIs come from should be configurable.

Currently, calendars and globes are hardcoded to use Wikidata URIs,
while units use the local repo concepts as the vocabulary.

This change checks incoming data values against the appropriate URI
prefixes.

Bug: 111171
Change-Id: Ib91b1c0a297fd9ab54b0dabf446eb8850e46ac0d
---
M repo/includes/ValidatorBuilders.php
M repo/includes/WikibaseRepo.php
M repo/tests/phpunit/includes/ValidatorBuildersTest.php
3 files changed, 70 insertions(+), 23 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase 
refs/changes/12/235512/1

diff --git a/repo/includes/ValidatorBuilders.php 
b/repo/includes/ValidatorBuilders.php
index 03a4b7f..69077a9 100644
--- a/repo/includes/ValidatorBuilders.php
+++ b/repo/includes/ValidatorBuilders.php
@@ -37,19 +37,35 @@
 class ValidatorBuilders {
 
        /**
-        * @var EntityIdParser
-        */
-       private $entityIdParser;
-
-       /**
         * @var EntityLookup
         */
        private $entityLookup;
 
        /**
+        * @var EntityIdParser
+        */
+       private $entityIdParser;
+
+       /**
         * @var string[]
         */
        private $urlSchemes;
+
+       /**
+        * @var string The base URI for the vocabulary to use for units (and in 
the
+        * future, globes and calendars).
+        */
+       private $vocabularyBaseUri;
+
+       /**
+        * @var string The base URI of Wikibase concepts, for use with the 
validators for time and globe
+        * values. Our parsers for these data types currently have Wikidata 
URIs hardcoded, so we need
+        * to hardcode the URI to check them against for now.
+        *
+        * @todo: use a configurable vocabulary for calendars and reference 
globes, instead of
+        * hardcoding wikidata. Then replace usages of $wikidataBaseUri with 
$vocabularyBaseUri.
+        */
+       private $wikidataBaseUri = 'http://www.wikidata.org/entity/';
 
        /**
         *
@@ -62,17 +78,20 @@
         * @param EntityIdParser $idParser
         * @param string[] $urlSchemes
         * @param ContentLanguages $contentLanguages
+        * @param string $vocabularyBaseUri The base URI for vocabulary 
concepts.
         */
        public function __construct(
                EntityLookup $lookup,
                EntityIdParser $idParser,
                array $urlSchemes,
+               $vocabularyBaseUri,
                ContentLanguages $contentLanguages
        ) {
-               $this->contentLanguages = $contentLanguages;
-               $this->entityIdParser = $idParser;
                $this->entityLookup = $lookup;
+               $this->entityIdParser = $idParser;
                $this->urlSchemes = $urlSchemes;
+               $this->vocabularyBaseUri = $vocabularyBaseUri;
+               $this->contentLanguages = $contentLanguages;
        }
 
        /**
@@ -187,7 +206,7 @@
                $validators[] = new TypeValidator( 'array' );
 
                // Expected to be a short IRI, see TimeFormatter and TimeParser.
-               $urlValidator = $this->getUrlValidator( array( 'http', 'https' 
), 255 );
+               $urlValidator = $this->getUrlValidator( array( 'http', 'https' 
), $this->wikidataBaseUri, 255 );
                //TODO: enforce well known calendar models from config
 
                $validators[] = new DataFieldValidator( 'calendarmodel', 
$urlValidator );
@@ -235,7 +254,7 @@
                $validators[] = new TypeValidator( 'array' );
 
                // Expected to be a short IRI, see GlobeCoordinateValue and 
GlobeCoordinateParser.
-               $urlValidator = $this->getUrlValidator( array( 'http', 'https' 
), 255 );
+               $urlValidator = $this->getUrlValidator( array( 'http', 'https' 
), $this->wikidataBaseUri, 255 );
                //TODO: enforce well known reference globes from config
 
                $validators[] = new DataFieldValidator( 'precision', new 
NumberValidator() );
@@ -251,13 +270,14 @@
 
        /**
         * @param string[] $urlSchemes List of URL schemes, e.g. 'http'
+        * @param string|null $prefix a required prefix
         * @param int $maxLength Defaults to 500 characters. Even if URLs are 
unlimited in theory they
         * should be limited to about 2000. About 500 is a reasonable 
compromise.
-        * @see http://stackoverflow.com/a/417184
         *
         * @return CompositeValidator
+        * @see http://stackoverflow.com/a/417184
         */
-       private function getUrlValidator( $urlSchemes, $maxLength = 500 ) {
+       private function getUrlValidator( array $urlSchemes, $prefix = null, 
$maxLength = 500 ) {
                $validators = array();
                $validators[] = new TypeValidator( 'string' );
                $validators[] = new StringLengthValidator( 2, $maxLength );
@@ -266,7 +286,23 @@
                $urlSchemeValidators = $urlValidators->getValidators( 
$urlSchemes );
                $validators[] = new UrlValidator( $urlSchemeValidators );
 
+               if ( $prefix !== null ) {
+                       $validators[] = $this->getPrefixValidator( $prefix, 
'bad-perefix' );
+               }
+
                return new CompositeValidator( $validators ); //Note: each 
validator is fatal
+       }
+
+       /**
+        * @param string $prefix
+        * @param string $errorCode
+        *
+        * @return RegexValidator
+        */
+       private function getPrefixValidator( $prefix, $errorCode ) {
+               //XXX: we may want to allow http AND https.
+               $regex = '!^' . preg_quote( $prefix, '!' ) . '!';
+               return new RegexValidator( $regex, false, $errorCode );
        }
 
        /**
@@ -295,9 +331,9 @@
                $unitValidators = new AlternativeValidator( array(
                        // NOTE: "1" is always considered legal for historical 
reasons,
                        // since we use it to represent "unitless" quantities. 
We could also use
-                       // http://qudt.org/vocab/unit#Unitless or 
https://www.wikidata.org/entity/Q199
+                       // http://qudt.org/vocab/unit#Unitless or 
http://www.wikidata.org/entity/Q199
                        new MembershipValidator( array( '1' ) ),
-                       $this->getUrlValidator( array( 'http', 'https' ), 255 ),
+                       $this->getUrlValidator( array( 'http', 'https' ), 
$this->vocabularyBaseUri, 255 ),
                ) );
                $validators[] = new DataFieldValidator( 'unit', $unitValidators 
);
 
diff --git a/repo/includes/WikibaseRepo.php b/repo/includes/WikibaseRepo.php
index cfc6e7b..3ad52d9 100644
--- a/repo/includes/WikibaseRepo.php
+++ b/repo/includes/WikibaseRepo.php
@@ -265,6 +265,7 @@
                        $this->getEntityLookup(),
                        $this->getEntityIdParser(),
                        $urlSchemes,
+                       $this->getVocabularyBaseUri(),
                        $this->getMonolingualTextLanguages()
                );
        }
@@ -722,6 +723,15 @@
        }
 
        /**
+        * @return string
+        */
+       private function getVocabularyBaseUri() {
+               //@todo: We currently use the local repo concept URI here. This 
should be configurable,
+               // to e.g. allow 3rd parties to use Wikidata as their vocab 
repo.
+               return $this->getSettings()->getSetting( 'conceptBaseUri' );
+       }
+
+       /**
         * @return OutputFormatSnakFormatterFactory
         */
        protected function newSnakFormatterFactory() {
diff --git a/repo/tests/phpunit/includes/ValidatorBuildersTest.php 
b/repo/tests/phpunit/includes/ValidatorBuildersTest.php
index 669d888..5cb0bd7 100644
--- a/repo/tests/phpunit/includes/ValidatorBuildersTest.php
+++ b/repo/tests/phpunit/includes/ValidatorBuildersTest.php
@@ -54,6 +54,7 @@
                        $entityLookup,
                        $entityIdParser,
                        $urlSchemes,
+                       'http://qudt.org/vocab/',
                        $contentLanguages
                );
 
@@ -114,21 +115,21 @@
                        array(
                                'time',
                                new TimeValue( '+2013-06-06T00:00:00Z', 0, 0, 
0, TimeValue::PRECISION_DAY,
-                                       'http://' . str_repeat( 'x', 256 ) ),
+                                       'http://www.wikidata.org/entity/Q' . 
str_repeat( '6', 256 ) ),
                                false,
                                'calendar: too long'
                        ),
                        array(
                                'time',
                                new TimeValue( '+2013-06-06T00:00:00Z', 0, 0, 
0, TimeValue::PRECISION_DAY,
-                                       'http://acme.com/calendar' ),
+                                       
'http://www.wikidata.org/entity/Q1985727' ),
                                true,
                                'calendar: URL'
                        ),
                        array(
                                'time',
                                new TimeValue( '+2013-06-06T00:00:00Z', 0, 0, 
0, TimeValue::PRECISION_DAY,
-                                       ' http://acme.com/calendar ' ),
+                                       ' 
http://www.wikidata.org/entity/Q1985727 ' ),
                                false,
                                'calendar: untrimmed'
                        ),
@@ -144,14 +145,14 @@
                        array(
                                'time',
                                new TimeValue( '+2013-06-06T11:22:33Z', 0, 0, 
0, TimeValue::PRECISION_DAY,
-                                       'http://acme.com/calendar' ),
+                                       
'http://www.wikidata.org/entity/Q1985727' ),
                                false,
                                'time given to the second'
                        ),
                        array(
                                'time',
                                new TimeValue( '+2013-06-06T00:00:00Z', 0, 0, 
0, TimeValue::PRECISION_SECOND,
-                                       'http://acme.com/calendar' ),
+                                       
'http://www.wikidata.org/entity/Q1985727' ),
                                false,
                                'precision: second'
                        ),
@@ -183,7 +184,7 @@
                        ),
                        array(
                                'globe-coordinate',
-                               new GlobeCoordinateValue( $latLonValue, null, 
'http://www.wikdiata.org/entity/Q2' ),
+                               new GlobeCoordinateValue( $latLonValue, null, 
'https://www.wikdiata.org/entity/Q2' ),
                                false,
                                'null precision is invalid'
                        ),
@@ -191,9 +192,9 @@
                        //globe-coordinate[globe]
                        // FIXME: this is testing unimplemented behaviour? 
Probably broken...
                        array( 'globe-coordinate', new GlobeCoordinateValue( 
$latLonValue, 1, '' ), false, 'globe: empty string should be invalid' ),
-                       array( 'globe-coordinate', new GlobeCoordinateValue( 
$latLonValue, 1, 'http://' . str_repeat( 'x', 256 ) ), false, 'globe: too long' 
),
-                       array( 'globe-coordinate', new GlobeCoordinateValue( 
$latLonValue, 1, 'http://acme.com/globe' ), true, 'globe: URL' ),
-                       array( 'globe-coordinate', new GlobeCoordinateValue( 
$latLonValue, 1, ' http://acme.com/globe ' ), false, 'globe: untrimmed' ),
+                       array( 'globe-coordinate', new GlobeCoordinateValue( 
$latLonValue, 1, 'http://www.wikidata.org/entity/Q' . str_repeat( '6', 256 ) ), 
false, 'globe: too long' ),
+                       array( 'globe-coordinate', new GlobeCoordinateValue( 
$latLonValue, 1, 'http://www.wikidata.org/entity/Q2' ), true, 'globe: URL' ),
+                       array( 'globe-coordinate', new GlobeCoordinateValue( 
$latLonValue, 1, ' http://www.wikidata.org/entity/Q2 ' ), false, 'globe: 
untrimmed' ),
                        array( 'globe-coordinate', new GlobeCoordinateValue( 
$latLonValue, 1, ' javascript:alert(1) ' ), false, 'globe: bad URL scheme' ),
                        //TODO: globe must be an item reference
                        //TODO: globe must be from a list of configured values
@@ -221,7 +222,7 @@
                        //quantity
                        array( 'quantity', QuantityValue::newFromNumber( 5 ), 
true, 'Simple integer' ),
                        array( 'quantity', QuantityValue::newFromNumber( 5, 
'http://qudt.org/vocab/unit#Meter' ), true, 'Vocabulary URI' ),
-                       array( 'quantity', QuantityValue::newFromNumber( 5, 
'https://www.wikidata.org/entity/Q11573' ), true, 'Wikidata URI' ),
+                       array( 'quantity', QuantityValue::newFromNumber( 5, 
'http://www.wikidata.org/entity/Q11573' ), false, 'Wikidata URI' ),
                        array( 'quantity', QuantityValue::newFromNumber( 5, '1' 
), true, '1 means unitless' ),
                        array( 'quantity', QuantityValue::newFromNumber( 5, 
'kittens' ), false, 'Bad unit URI' ),
                        array( 'quantity', QuantityValue::newFromNumber( 
'-11.234', '1', '-10', '-12' ), true, 'decimal strings' ),

-- 
To view, visit https://gerrit.wikimedia.org/r/235512
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ib91b1c0a297fd9ab54b0dabf446eb8850e46ac0d
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Daniel Kinzler <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to