Smalyshev has uploaded a new change for review. https://gerrit.wikimedia.org/r/319401
Change subject: Tools for creating unit conversion config ...................................................................... Tools for creating unit conversion config Change-Id: I7845e1fb3b5f862799462428481d63ab8e7afdd6 Bug: T117032 (cherry picked from commit ee990835e4ccf620f4612db82940c7eac98a56b5) --- M composer.json M lib/tests/phpunit/Units/UnitStorageTest.php M lib/tests/phpunit/Units/testunits.json A repo/maintenance/SPARQLClient.php A repo/maintenance/SPARQLException.php A repo/maintenance/updateUnits.php A repo/tests/phpunit/maintenance/UpdateUnitsTest.php 7 files changed, 616 insertions(+), 4 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase refs/changes/01/319401/1 diff --git a/composer.json b/composer.json index e244470..33a8399 100644 --- a/composer.json +++ b/composer.json @@ -26,7 +26,7 @@ "data-values/common": "~0.3.0", "data-values/geo": "~1.0", "data-values/interfaces": "~0.2.0|~0.1.5", - "data-values/number": "~0.7.0", + "data-values/number": "~0.8.0", "data-values/time": "~0.8.4", "data-values/validators": "~0.1.0", "data-values/data-types": "~0.5.0", diff --git a/lib/tests/phpunit/Units/UnitStorageTest.php b/lib/tests/phpunit/Units/UnitStorageTest.php index 6951547..2b78b37 100644 --- a/lib/tests/phpunit/Units/UnitStorageTest.php +++ b/lib/tests/phpunit/Units/UnitStorageTest.php @@ -9,7 +9,7 @@ use RuntimeException; /** - * @covers Wikibase\Lib\UnitConverter + * @covers \Wikibase\Lib\UnitConverter * * @group Wikibase * @@ -46,7 +46,7 @@ $this->assertEquals( [ 'factor' => '22.234', 'unit' => 'Q1' ], $storage->getConversion( 'Q2' ) ); $this->assertEquals( [ 'factor' => '1', 'unit' => 'Q3' ], $storage->getConversion( 'Q3' ) ); - $this->assertEquals( [ 'factor' => '0.0000000000000000000243885945', 'unit' => 'Q3' ], + $this->assertArraySubset( [ 'factor' => '0.0000000000000000000243885945', 'unit' => 'Q3' ], $storage->getConversion( 'Q4' ) ); $this->assertNull( $storage->getConversion( 'Q5' ) ); diff --git a/lib/tests/phpunit/Units/testunits.json b/lib/tests/phpunit/Units/testunits.json index e1cc307..20c191e 100644 --- a/lib/tests/phpunit/Units/testunits.json +++ b/lib/tests/phpunit/Units/testunits.json @@ -13,6 +13,7 @@ ], "Q4": { "factor": "0.0000000000000000000243885945", - "unit": "Q3" + "unit": "Q3", + "otherdata": "should be ignored" } } diff --git a/repo/maintenance/SPARQLClient.php b/repo/maintenance/SPARQLClient.php new file mode 100644 index 0000000..87da2ba --- /dev/null +++ b/repo/maintenance/SPARQLClient.php @@ -0,0 +1,97 @@ +<?php + +namespace Wikibase\Repo\Maintenance; + +use MWHttpRequest; + +/** + * Simple SPARQL client + */ +class SPARQLClient { + + /** + * Query timeout + */ + const TIMEOUT = 300; + + /** + * Construct. + * @param string $url SPARQL Endpoint + * @param string $baseURL RDF base URL - common prefix in entity URIs + */ + public function __construct( $url, $baseURL ) { + $this->endpoint = $url; + $this->baseURL = $baseURL; + } + + /** + * Query SPARQL endpoint + * @param string $sparql query + * @param bool $rawData Whether to return only values or full data objects + * @return array List of results, one row per array element + * Each row will contain fields indexed by variable name. + * @throws SPARQLException + */ + public function query( $sparql, $rawData = false ) { + $url = $this->endpoint . '?' . http_build_query( [ "query" => $sparql, "format" => "json" ] ); + $options = [ 'method' => 'GET', 'timeout' => self::TIMEOUT ]; + $request = MWHttpRequest::factory( $url, $options, __METHOD__ ); + $status = $request->execute(); + if ( !$status->isOK() ) { + throw new SPARQLException( "HTTP error: {$status->getWikiText()}" ); + } + $result = $request->getContent(); + $data = json_decode( $result, true ); + if ( !$data ) { + throw new SPARQLException( "HTTP request failed, response:\n$result" ); + } + + return $this->extractData( $data, $rawData ); + } + + /** + * Get list of IDs satisfying the query + * @param string $sparql query + * @param string $item variable name designating the needed element + * @return string[]|false List of IDs from query + */ + public function getIDs( $sparql, $item ) { + $data = $this->query( $sparql, false ); + if ( $data ) { + return array_map( function ( $row ) use ( $item ) { + return str_replace( $this->baseURL, '', $row[$item] ); + }, $data ); + } + return []; + } + + /** + * Extract data from SPARQL response format + * @param array $data SPARQL result + * @param bool $rawData Whether to return only values or full data objects + * @return array List of results, one row per element. + */ + private function extractData( $data, $rawData = false ) { + $result = []; + if ( $data && !empty( $data['results'] ) ) { + $vars = $data['head']['vars']; + $resrow = []; + foreach ( $data['results']['bindings'] as $row ) { + foreach ( $vars as $var ) { + if ( !isset( $row[$var] ) ) { + $resrow[$var] = null; + continue; + } + if ( $rawData ) { + $resrow[$var] = $row[$var]; + } else { + $resrow[$var] = $row[$var]['value']; + } + } + $result[] = $resrow; + } + } + return $result; + } + +} diff --git a/repo/maintenance/SPARQLException.php b/repo/maintenance/SPARQLException.php new file mode 100644 index 0000000..8ec50ef --- /dev/null +++ b/repo/maintenance/SPARQLException.php @@ -0,0 +1,9 @@ +<?php +namespace Wikibase\Repo\Maintenance; + +/** + * SPARQL exception. + */ +class SPARQLException extends \Exception +{ +} diff --git a/repo/maintenance/updateUnits.php b/repo/maintenance/updateUnits.php new file mode 100644 index 0000000..21eecc4 --- /dev/null +++ b/repo/maintenance/updateUnits.php @@ -0,0 +1,350 @@ +<?php +namespace Wikibase; + +use DataValues\DecimalMath; +use DataValues\DecimalValue; +use Maintenance; +use Wikibase\Repo\Maintenance\SPARQLClient; +use Wikibase\Repo\WikibaseRepo; + +$basePath = + getenv( 'MW_INSTALL_PATH' ) !== false ? getenv( 'MW_INSTALL_PATH' ) : __DIR__ . '/../../../..'; +require_once $basePath . '/maintenance/Maintenance.php'; +require_once __DIR__ . '/SPARQLClient.php'; + +/** + * Update the conversion table for units. + * Base unit types for Wikidata: + * Q223662,Q208469 + * SI base unit,SI derived unit + * TODO: add support to non-SI units + * Example run: + * mwscript extensions/WikidataBuildResources/extensions/Wikibase/repo/maintenance/updateUnits.php + * --wiki wikidatawiki --base-unit-types Q223662,Q208469 --base-uri http://www.wikidata.org/entity/ + * --unit-class Q1978718 > unitConversion.json + * @package Wikibase + */ +class UpdateUnits extends Maintenance { + + + /** Base URI + * @var string + */ + private $baseUri; + /** + * Length of the base URI. + * Helper variable to speed up cutting it out. + * @var int + */ + private $baseLen; + /** + * @var SPARQLClient + */ + private $client; + + /** + * Should we silence the error output for tests? + * @var boolean + */ + public $silent; + + public function __construct() { + parent::__construct(); + $this->addDescription( "Update unit conversion table." ); + + $this->addOption( 'base-unit-types', 'Types of base units.', true, true ); + $this->addOption( 'base-uri', 'Base URI for the data.', false, true ); + $this->addOption( 'unit-class', 'Class for units.', false, true ); + $this->addOption( 'format', 'Output format, default is json.', false, true ); + $this->addOption( 'sparql', 'SPARQL endpoint URL.', false, true ); + } + + public function execute() { + if ( !defined( 'WB_VERSION' ) ) { + $this->error( "You need to have Wikibase enabled in order to use this maintenance script!", + 1 ); + } + $format = $this->getOption( 'format', 'json' ); + if ( !is_callable( [ $this, 'format' . $format ] ) ) { + $this->error( "Invalid format", 1 ); + } + + $repo = WikibaseRepo::getDefaultInstance(); + $endPoint = + $this->getOption( 'sparql', $repo->getSettings()->getSetting( 'sparqlEndpoint' ) ); + if ( !$endPoint ) { + $this->error( 'SPARQL endpoint not defined', 1 ); + } + $this->setBaseUri( $this->getOption( 'base-uri', + $repo->getSettings()->getSetting( 'conceptBaseUri' ) ) ); + $this->client = new SPARQLClient( $endPoint, $this->baseUri ); + + $unitClass = $this->getOption( 'unit-class' ); + if ( $unitClass ) { + $filter = "FILTER EXISTS { ?unit wdt:P31/wdt:P279* wd:$unitClass }\n"; + } else { + $filter = ''; + } + + // Get units usage stats. We don't care about units + // That have been used less than 10 times, for now + $unitUsage = $this->getUnitUsage( 10 ); + $baseUnits = $this->getBaseUnits( $filter ); + + $convertUnits = []; + $reconvert = []; + + $convertableUnits = $this->getConvertableUnits( $filter ); + foreach ( $convertableUnits as $unit ) { + $converted = + $this->convertUnit( $unit, $convertUnits, $baseUnits, $unitUsage, $reconvert ); + if ( $converted ) { + $unitName = substr( $unit['unit'], $this->baseLen ); + $convertUnits[$unitName] = $converted; + } + } + + // try to convert some units that reduce to other units + while ( $reconvert ) { + $converted = false; + foreach ( $reconvert as $name => $unit ) { + $convertedUnit = $this->convertDerivedUnit( $unit, $convertUnits ); + if ( $convertedUnit ) { + $convertUnits[$name] = $convertedUnit; + unset( $reconvert[$name] ); + $converted = true; + } + } + // we didn't convert any on this step, no use to continue + // This loop will converge since on each step we will reduce + // the length of $reconvert until we can't do it anymore. + if ( !$converted ) { + break; + } + } + + if ( $reconvert ) { + // still have unconverted units + foreach ( $reconvert as $name => $unit ) { + $this->error( "Weird base unit: {$unit['unit']} reduces to {$unit['siUnit']} which is not base!" ); + } + } + + // Add base units + foreach ( $baseUnits as $base => $baseData ) { + $convertUnits[$base] = [ + 'factor' => "1", + 'unit' => $base, + 'label' => $baseData['unitLabel'], + 'siLabel' => $baseData['unitLabel'] + ]; + } + + $formatter = 'format' . $format; + echo $this->$formatter( $convertUnits ); + } + + /** + * Set base URI + * @param string $uri + */ + public function setBaseUri( $uri ) { + $this->baseUri = $uri; + $this->baseLen = strlen( $uri ); + } + + /** + * Convert unit that does not reduce to a basic unit. + * @param string $unit + * @param array[] $convertUnits List of units already converted + * @return string[]|null Converted data for the unit or null if no conversion possible. + */ + public function convertDerivedUnit( $unit, $convertUnits ) { + if ( isset( $convertUnits[$unit['siUnit']] ) ) { + // we have conversion now + $math = new DecimalMath(); + $newUnit = $convertUnits[$unit['siUnit']]; + $newFactor = + $math->product( new DecimalValue( $unit['si'] ), + new DecimalValue( $newUnit['factor'] ) ); + return [ + 'factor' => trim( $newFactor->getValue(), '+' ), + 'unit' => $newUnit['unit'], + 'label' => $unit['unitLabel'], + 'siLabel' => $newUnit['siLabel'] + ]; + } + return null; + } + + /** + * Create conversion data for a single unit. + * @param string[] $unit Unit data + * @param string[] $convertUnits Already converted data + * @param string[] $baseUnits Base unit list + * @param string[] $unitUsage Unit usage data + * @param string[] &$reconvert Array collecting units that require re-conversion later, + * due to their target unit not being base. + * @return null|\string[] Produces conversion data for the unit or null if not possible. + */ + public function convertUnit( $unit, $convertUnits, $baseUnits, $unitUsage, &$reconvert ) { + $unit['unit'] = substr( $unit['unit'], $this->baseLen ); + $unit['siUnit'] = substr( $unit['siUnit'], $this->baseLen ); + + if ( $unit['unitLabel'][0] == 'Q' ) { + // Skip exotic units that have no English name for now. + // TODO: drop this + $this->error( "Exotic unit: {$unit['unit']} has no English label, skipping for now." ); + return null; + } + + if ( isset( $convertUnits[$unit['unit']] ) ) { + // done already + return null; + } + if ( $unit['unit'] == $unit['siUnit'] ) { + // base unit + if ( $unit['si'] != 1 ) { + $this->error( "Weird unit: {$unit['unit']} is {$unit['si']} of itself!" ); + return null; + } + if ( !isset( $baseUnits[$unit['siUnit']] ) ) { + $this->error( "Weird unit: {$unit['unit']} is self-referring but not base!" ); + return null; + } + } + + if ( !isset( $baseUnits[$unit['unit']] ) && !isset( $unitUsage[$unit['unit']] ) ) { + $this->error( "Low usage unit {$unit['unit']}, skipping..." ); + return null; + } + + if ( !isset( $baseUnits[$unit['siUnit']] ) ) { + // target unit is not actually base + $reconvert[$unit['unit']] = $unit; + } else { + return [ + 'factor' => $unit['si'], + 'unit' => $unit['siUnit'], + // These two are just for humans, not used by actual converter + 'label' => $unit['unitLabel'], + 'siLabel' => $unit['siUnitLabel'] + ]; + } + + return null; + } + + /** + * Format units as JSON + * @param $convertUnits + * @return string + */ + private function formatJSON( $convertUnits ) { + return json_encode( $convertUnits, JSON_PRETTY_PRINT ); + } + + /** + * Get units that are used at least $min times. + * We don't care about units that have been used less than 10 times, for now. + * Only top 200 will be returned (though so far we don't have that many). + * @param int $min Minimal usage for the unit. + * @return string[] Array of ['unit' => Q-id, 'c' => count] + */ + private function getUnitUsage( $min ) { + $usageQuery = <<<UQUERY +SELECT ?unit (COUNT(DISTINCT ?v) as ?c) WHERE { + ?v wikibase:quantityUnit ?unit . + ?s ?p ?v . + FILTER(?unit != wd:Q199) +# Exclude currencies + FILTER NOT EXISTS { ?unit wdt:P31+ wd:Q8142 } +} GROUP BY ?unit + HAVING(?c >= $min) + ORDER BY DESC(?c) + LIMIT 200 +UQUERY; + $unitUsage = $this->client->getIDs( $usageQuery, 'unit' ); + $unitUsage = array_flip( $unitUsage ); + return $unitUsage; + } + + /** + * Get base units + * @param string $filter Unit filter + * @return array + */ + private function getBaseUnits( $filter ) { + $types = + str_replace( [ ',', 'Q' ], [ ' ', 'wd:Q' ], $this->getOption( 'base-unit-types' ) ); + + $baseQuery = <<<QUERY +SELECT ?unit ?unitLabel WHERE { + VALUES ?class { $types } + ?unit wdt:P31 ?class . + $filter + SERVICE wikibase:label { + bd:serviceParam wikibase:language "en" . + } +} +QUERY; + $baseUnitsData = $this->client->query( $baseQuery ); + $baseUnits = []; + // arrange better lookup + foreach ( $baseUnitsData as $base ) { + $item = substr( $base['unit'], strlen( $this->baseUri ) ); + $baseUnits[$item] = $base; + } + return $baseUnits; + } + + /** + * Retrieve the list of convertable units. + * @param $filter + * @return array[]|false List of units that can be converted + */ + private function getConvertableUnits( $filter ) { + $unitsQuery = <<<QUERY +SELECT REDUCED ?unit ?si ?siUnit ?unitLabel ?siUnitLabel WHERE { + ?unit wdt:P31 ?type . + ?type wdt:P279* wd:Q47574 . + # Not a currency + FILTER (?type != wd:Q8142) + # Not a cardinal number + FILTER NOT EXISTS { ?unit wdt:P31 wd:Q163875 } + $filter + # Has conversion to SI Units + ?unit p:P2370/psv:P2370 [ wikibase:quantityAmount ?si; wikibase:quantityUnit ?siUnit ] . + SERVICE wikibase:label { + bd:serviceParam wikibase:language "en" . + } +# Enable this to select only units that are actually used + FILTER EXISTS { [] wikibase:quantityUnit ?unit } +} +QUERY; + return $this->client->query( $unitsQuery ); + } + + /** + * Format units as CSV + * @param $convertUnits + * @return string + */ + private function formatCSV( $convertUnits ) { + $str = ''; + foreach ( $convertUnits as $name => $data ) { + $str .= "$name,$data[0],$data[1]\n"; + } + return $str; + } + + protected function error( $err, $die = 0 ) { + if ( !$this->silent ) { + parent::error( $err, $die ); + } + } + +} + +$maintClass = UpdateUnits::class; +require_once RUN_MAINTENANCE_IF_MAIN; diff --git a/repo/tests/phpunit/maintenance/UpdateUnitsTest.php b/repo/tests/phpunit/maintenance/UpdateUnitsTest.php new file mode 100644 index 0000000..24bb0df --- /dev/null +++ b/repo/tests/phpunit/maintenance/UpdateUnitsTest.php @@ -0,0 +1,155 @@ +<?php +namespace Wikibase\Test; + +use MediaWikiLangTestCase; +use Wikibase\UpdateUnits; + +/** + * @covers updateUnits.php + * @group Wikibase + */ +class UpdateUnitsTest extends MediaWikiLangTestCase { + + /** + * @var UpdateUnits + */ + private $script; + + public function setUp() { + parent::setUp(); + $this->script = new UpdateUnits(); + $this->script->setBaseUri( 'http://acme.test/' ); + $this->script->silent = true; + } + + public function getUnitCases() { + return [ + 'derived SI unit' => [ + [ + 'unit' => 'http://acme.test/Q2', + 'si' => '123.45', + 'siUnit' => 'http://acme.test/Q1', + 'unitLabel' => 'test unit Q2', + 'siUnitLabel' => 'test unit Q1', + ], + [ + 'factor' => '123.45', + 'unit' => 'Q1', + 'label' => 'test unit Q2', + 'siLabel' => 'test unit Q1', + ] + ], + 'unknown base unit' => [ + [ + 'unit' => 'http://acme.test/Q2', + 'si' => '123.45', + 'siUnit' => 'http://acme.test/Q5', + 'unitLabel' => 'test unit Q2', + 'siUnitLabel' => 'test unit Q5', + ], + null + ], + 'already done' => [ + [ + 'unit' => 'http://acme.test/Q10', + 'si' => '123.45', + 'siUnit' => 'http://acme.test/Q5', + 'unitLabel' => 'test 10', + 'siUnitLabel' => 'test unit Q5', + ], + null + ], + 'weird base unit' => [ + [ + 'unit' => 'http://acme.test/Q1', + 'si' => '123.45', + 'siUnit' => 'http://acme.test/Q1', + 'unitLabel' => 'test 1', + 'siUnitLabel' => 'test unit Q1', + ], + null + ], + 'weird non-base unit' => [ + [ + 'unit' => 'http://acme.test/Q2', + 'si' => '1', + 'siUnit' => 'http://acme.test/Q2', + 'unitLabel' => 'test 2', + 'siUnitLabel' => 'test unit Q2', + ], + null + ], + 'low usage unit' => [ + [ + 'unit' => 'http://acme.test/Q4', + 'si' => '123.45', + 'siUnit' => 'http://acme.test/Q1', + 'unitLabel' => 'test 4', + 'siUnitLabel' => 'test unit Q1', + ], + null + ], + 'reconvertable' => [ + [ + 'unit' => 'http://acme.test/Q3', + 'si' => '123.45', + 'siUnit' => 'http://acme.test/Q2', + 'unitLabel' => 'test', + 'siUnitLabel' => 'test unit Q2', + ], + null + ], + ]; + } + + /** + * @dataProvider getUnitCases + * @param $unit + * @param $expect + */ + public function testConvertUnit( $unit, $expect ) { + $usage = [ 'Q1' => 100, 'Q2' => 50, 'Q3' => 10 ]; + $base = [ 'Q1' => true ]; + $converted = [ 'Q10' => [] ]; + + $reconvert = []; + $converted = $this->script->convertUnit( $unit, $converted, $base, $usage, $reconvert ); + $this->assertEquals( $expect, $converted ); + } + + public function testConvertDerivedUnit() { + $unit = [ + 'unit' => 'http://acme.test/Q3', + 'si' => '67.89', + 'siUnit' => 'http://acme.test/Q2', + 'unitLabel' => 'test unit Q3', + 'siUnitLabel' => 'test unit Q2', + ]; + $usage = [ 'Q1' => 100, 'Q2' => 50, 'Q3' => 10 ]; + $base = [ 'Q1' => true ]; + $converted = [ + 'Q2' => [ + 'factor' => '123.45', + 'unit' => 'Q1', + 'label' => 'test unit Q2', + 'siLabel' => 'test unit Q1', + ] + ]; + $expected = [ + 'factor' => '8381.0205', + 'unit' => 'Q1', + 'label' => 'test unit Q3', + 'siLabel' => 'test unit Q1', + ]; + + $reconvert = []; + $convertedUnit = $this->script->convertUnit( $unit, $converted, $base, $usage, $reconvert ); + $this->assertNull( $convertedUnit ); + $reconverted = reset( $reconvert ); + + $convertedUnit = $this->script->convertDerivedUnit( $reconverted, $converted ); + $this->assertEquals( $expected, $convertedUnit ); + + } + +} -- To view, visit https://gerrit.wikimedia.org/r/319401 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I7845e1fb3b5f862799462428481d63ab8e7afdd6 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/Wikibase Gerrit-Branch: wmf/1.28.0-wmf.23 Gerrit-Owner: Smalyshev <smalys...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits