Hoo man has submitted this change and it was merged.
Change subject: Support multiple compression formats for dumps
......................................................................
Support multiple compression formats for dumps
Turn the compression format into a config variable, thereby allowing
both gzip and bzip2 dump files.
This deprecates one i18n message and requires config.json to be updated.
Additionally:
* Breaks out i18n for distributions
* Introduces and renames some variables to improve code legibility
* Updates README with this and one older change
Bug: T118397
Change-Id: Ieb517b5ad677abaa541274d69155855b95787f12
---
M DCAT.php
M README.md
M config.example.json
M i18n/en.json
M i18n/qqq.json
5 files changed, 105 insertions(+), 60 deletions(-)
Approvals:
Hoo man: Verified; Looks good to me, approved
diff --git a/DCAT.php b/DCAT.php
index 9904932..8d2bc6d 100644
--- a/DCAT.php
+++ b/DCAT.php
@@ -42,7 +42,9 @@
}
if ( $config['dumps-enabled'] ) {
array_push( $top, "dump-info" );
- $sub["dump-info"] = array( "accessURL", "mediatype", "license"
);
+ $sub["dump-info"] = array(
+ "accessURL", "mediatype", "compression", "license"
+ );
}
// Test
@@ -149,12 +151,12 @@
* @param XmlWriter $xml XML stream to write to
* @param array $data data-blob of i18n and config variables
* @param string|null $dumpDate the date of the dumpfile, null for live data
- * @param string $format the fileformat
+ * @param string $dumpKey the key for the corresponding dump file
*/
-function dumpDistributionExtras( XMLWriter $xml, array $data, $dumpDate,
$format ) {
+function dumpDistributionExtras( XMLWriter $xml, array $data, $dumpDate,
$dumpKey ) {
$url = str_replace(
'$1',
- $dumpDate . '/' .
$data['dumps'][$dumpDate][$format]['filename'],
+ $dumpDate . '/' .
$data['dumps'][$dumpDate][$dumpKey]['filename'],
$data['config']['dump-info']['accessURL']
);
@@ -169,14 +171,50 @@
$xml->startElementNS( 'dcterms', 'issued', null );
$xml->writeAttributeNS( 'rdf', 'datatype', null,
'http://www.w3.org/2001/XMLSchema#date' );
- $xml->text( $data['dumps'][$dumpDate][$format]['timestamp'] );
+ $xml->text( $data['dumps'][$dumpDate][$dumpKey]['timestamp'] );
$xml->endElement();
$xml->startElementNS( 'dcat', 'byteSize', null );
$xml->writeAttributeNS( 'rdf', 'datatype', null,
'http://www.w3.org/2001/XMLSchema#decimal' );
- $xml->text( $data['dumps'][$dumpDate][$format]['byteSize'] );
+ $xml->text( $data['dumps'][$dumpDate][$dumpKey]['byteSize'] );
$xml->endElement();
+}
+
+/**
+ * Add i18n descriptions for a distribution
+ *
+ * @param XmlWriter $xml XML stream to write to
+ * @param array $data data-blob of i18n and config variables
+ * @param bool $isDump whether this is a dump distribution
+ * @param string $prefix the type of distribution, one of ld, api or dump
+ * @param string $format the file format, if dump
+ * @param string $compression the compression format, if dump
+ */
+function writeDistributionI18n( XMLWriter $xml, array $data, $isDump,
+ $prefix, $format, $compression ) {
+
+ foreach ( $data['i18n'] as $langCode => $langData ) {
+ if ( array_key_exists( "distribution-$prefix-description",
$langData ) ) {
+ $formatDescription =
$langData["distribution-$prefix-description"];
+ if ( $isDump ) {
+ $formatDescription = str_replace(
+ '$1',
+ $format,
+ $formatDescription
+ );
+ $formatDescription = str_replace(
+ '$2',
+ $compression,
+ $formatDescription
+ );
+ }
+ $xml->startElementNS( 'dcterms', 'description', null );
+ $xml->writeAttributeNS( 'xml', 'lang', null, $langCode
);
+ $xml->text( $formatDescription );
+ $xml->endElement();
+ }
+ }
}
/**
@@ -193,56 +231,55 @@
function writeDistribution( XMLWriter $xml, array $data, $distribId, $prefix,
$dumpDate ) {
$ids = array();
+ $isDump = !is_null( $dumpDate );
$allowedMediatypes = $data['config']["$prefix-info"]['mediatype'];
- foreach ( $allowedMediatypes as $format => $mediatype ) {
- // handle missing (and BETA) dump files
- if ( !is_null( $dumpDate ) and !array_key_exists( $format,
$data['dumps'][$dumpDate] ) ) {
- continue;
- }
+ $allowedCompressiontypes = array( '' => '' ); // dummy array for
non-dumps
+ if ( $isDump ) {
+ $allowedCompressiontypes =
$data['config']["$prefix-info"]['compression'];
+ }
- $id = $data['config']['uri'] . '#' . $distribId . $dumpDate .
$format;
- array_push( $ids, $id );
+ foreach ( $allowedCompressiontypes as $compressionName => $compression
) {
+ foreach ( $allowedMediatypes as $format => $mediatype ) {
+ $distributionKey = $format . $compression;
- $xml->startElementNS( 'rdf', 'Description', null );
- $xml->writeAttributeNS( 'rdf', 'about', null, $id );
-
- $xml->startElementNS( 'rdf', 'type', null );
- $xml->writeAttributeNS( 'rdf', 'resource', null,
- 'http://www.w3.org/ns/dcat#Distribution' );
- $xml->endElement();
-
- $xml->startElementNS( 'dcterms', 'license', null );
- $xml->writeAttributeNS( 'rdf', 'resource', null,
- $data['config']["$prefix-info"]['license'] );
- $xml->endElement();
-
- if ( is_null( $dumpDate ) ) {
- $xml->startElementNS( 'dcat', 'accessURL', null );
- $xml->writeAttributeNS( 'rdf', 'resource', null,
- $data['config']["$prefix-info"]['accessURL'] );
- $xml->endElement();
- } else {
- dumpDistributionExtras( $xml, $data, $dumpDate, $format
);
- }
-
- $xml->writeElementNS( 'dcterms', 'format', null, $mediatype );
-
- // add description in each language
- foreach ( $data['i18n'] as $langCode => $langData ) {
- if ( array_key_exists(
"distribution-$prefix-description", $langData ) ) {
- $formatDescription = str_replace(
- '$1',
- $format,
-
$langData["distribution-$prefix-description"]
- );
- $xml->startElementNS( 'dcterms', 'description',
null );
- $xml->writeAttributeNS( 'xml', 'lang', null,
$langCode );
- $xml->text( $formatDescription );
- $xml->endElement();
+ // handle missing (and BETA) dump files
+ if ( $isDump and !array_key_exists( $distributionKey ,
$data['dumps'][$dumpDate] ) ) {
+ continue;
}
- }
- $xml->endElement();
+ $id = $data['config']['uri'] . '#' . $distribId .
$dumpDate . $distributionKey;
+ array_push( $ids, $id );
+
+ $xml->startElementNS( 'rdf', 'Description', null );
+ $xml->writeAttributeNS( 'rdf', 'about', null, $id );
+
+ $xml->startElementNS( 'rdf', 'type', null );
+ $xml->writeAttributeNS( 'rdf', 'resource', null,
+ 'http://www.w3.org/ns/dcat#Distribution' );
+ $xml->endElement();
+
+ $xml->startElementNS( 'dcterms', 'license', null );
+ $xml->writeAttributeNS( 'rdf', 'resource', null,
+ $data['config']["$prefix-info"]['license'] );
+ $xml->endElement();
+
+ if ( !$isDump ) {
+ $xml->startElementNS( 'dcat', 'accessURL', null
);
+ $xml->writeAttributeNS( 'rdf', 'resource', null,
+
$data['config']["$prefix-info"]['accessURL'] );
+ $xml->endElement();
+ } else {
+ dumpDistributionExtras( $xml, $data, $dumpDate,
$distributionKey );
+ }
+
+ $xml->writeElementNS( 'dcterms', 'format', null,
$mediatype );
+
+ // add description in each language
+ writeDistributionI18n( $xml, $data, $isDump, $prefix,
+ $format, $compressionName );
+
+ $xml->endElement();
+ }
}
return $ids;
@@ -584,8 +621,10 @@
*/
function scanDump( $dirname, array $data ) {
$testStrings = array();
- foreach ( $data['config']['dump-info']['mediatype'] as $fileEnding =>
$mediatype ) {
- $testStrings[$fileEnding] = 'all.' . $fileEnding . '.gz';
+ foreach ( $data['config']['dump-info']['compression'] as $compression )
{
+ foreach ( $data['config']['dump-info']['mediatype'] as $format
=> $mediatype ) {
+ $testStrings["$format$compression"] = '-all.' . $format
. '.' . $compression;
+ }
}
$dumps = array();
@@ -594,7 +633,7 @@
foreach ( glob( $dirname . '/[0-9]*', GLOB_ONLYDIR ) as $subdir ) {
// $subdir = testdirNew/20150120
$subDump = array();
- foreach ( glob( $subdir . '/*.gz' ) as $filename ) {
+ foreach ( glob( $subdir . '/*' ) as $filename ) {
// match each file against an expected testString
foreach ( $testStrings as $fileEnding => $testString ) {
if ( substr( $filename, -strlen( $testString )
) === $testString ) {
diff --git a/README.md b/README.md
index f2791bf..76fb017 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
* Content negotiation (various formats)
* MediaWiki api (various formats)
-* Entity dumps e.g. json, ttl (assumes that these are gziped)
+* Entity dumps e.g. json, ttl (assumes that these are compressed)
An example result can be found at [lokal-profil /
dcatap.rdf](https://gist.github.com/lokal-profil/8086dc6bf2398d84a311).
The live DCAT-AP description of Wikidata can be found
[here](https://dumps.wikimedia.org/wikidatawiki/entities/dcatap.rdf).
@@ -102,6 +102,8 @@
* `accessURL`: URL to the directory where the *.json.gz* files
reside (`$1` is replaced on the fly by the actual filename),
e.g. *http://example.org/dumps/$1*
- * `mediatype`: (`object`) List of media types. In practice this is
- always `{"json": "application/json"}` ... for now
+ * `mediatype`: (`object`) List of media types. e.g.
+ `{"json": "application/json"}`
+ * `compression`: (`object`) List of compression formats, in the
+ format *name:file-ending* e.g. `{"gzip": "gz"}`
* `license`: See ld-info:license above
diff --git a/config.example.json b/config.example.json
index e082fa9..bc1e6f9 100644
--- a/config.example.json
+++ b/config.example.json
@@ -45,6 +45,10 @@
"json": "application/json",
"ttl": "text/turtle"
},
+ "compression": {
+ "gzip": "gz",
+ "bzip2": "bz2"
+ },
"license": "http://creativecommons.org/publicdomain/zero/1.0/"
}
}
diff --git a/i18n/en.json b/i18n/en.json
index d767a75..ee81494 100644
--- a/i18n/en.json
+++ b/i18n/en.json
@@ -5,8 +5,8 @@
"dataset-live-title": "Live access",
"dataset-live-description": "The live version of the data, includes
entities and properties. Only non-deprecated formats are listed as
distributions.",
"dataset-dump-title": "Entity dump of $1",
- "dataset-dump-description": "A static dump of all entites for the given
date.",
+ "dataset-dump-description": "A static dump of all entities for the
given date.",
"distribution-ld-description": "The Linked Data endpoint. Format is
resolved through content negotiation.",
"distribution-api-description": "The MediaWiki API endpoint. Format is
given through the \"format\" parameter.",
- "distribution-dump-description": "A gziped $1 file."
+ "distribution-dump-description": "A $1 file, $2 compressed."
}
diff --git a/i18n/qqq.json b/i18n/qqq.json
index c669eb1..12b1dac 100644
--- a/i18n/qqq.json
+++ b/i18n/qqq.json
@@ -10,5 +10,5 @@
"dataset-dump-description": "The description of the entity dump for the
given date.",
"distribution-ld-description": "The description of the Linked Data
endpoint. For content negotiation see
https://en.wikipedia.org/wiki/Content_negotiation",
"distribution-api-description": "The description of the MediaWiki API
endpoint. Leave \"format\" untranslated.",
- "distribution-dump-description": "The description of a dump file where
$1 is the file format."
+ "distribution-dump-description": "The description of a dump file where
$1 is the file format and $2 the compression format."
}
--
To view, visit https://gerrit.wikimedia.org/r/262422
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ieb517b5ad677abaa541274d69155855b95787f12
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps/dcat
Gerrit-Branch: master
Gerrit-Owner: Lokal Profil <[email protected]>
Gerrit-Reviewer: Hoo man <[email protected]>
Gerrit-Reviewer: Lokal Profil <[email protected]>
Gerrit-Reviewer: Nikerabbit <[email protected]>
Gerrit-Reviewer: Raimond Spekking <[email protected]>
Gerrit-Reviewer: Siebrand <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits