Hello Ottomata, I'd like you to do a code review. Please visit
https://gerrit.wikimedia.org/r/189981 to review the following change. Change subject: Add parser for media file urls ...................................................................... Add parser for media file urls Change-Id: I1b76e1e331ea781aee13557fc55a2c19ce5744a7 --- M changelog.md A refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/MediaFileUrlInfo.java A refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/MediaFileUrlParser.java A refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestMediaFileUrlInfo.java A refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestMediaFileUrlParser.java M refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsLegacyPageviewUDF.java A refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/MediaFileUrlParserUDF.java A refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestMediaFileUrlParserUDF.java 8 files changed, 1,655 insertions(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery/source refs/changes/81/189981/1 diff --git a/changelog.md b/changelog.md index 76ac26a..5733c29 100644 --- a/changelog.md +++ b/changelog.md @@ -1,6 +1,7 @@ ## v0.0.6-SNAPSHOT * Add custom percent en-/decoders to ease URL normalization. * Add Referer classifier +* Add parser for media file urls ## v0.0.5 * For geocoding, allow to specify the MaxMind databases that should get used. 
diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/MediaFileUrlInfo.java b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/MediaFileUrlInfo.java new file mode 100644 index 0000000..ab972f9 --- /dev/null +++ b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/MediaFileUrlInfo.java @@ -0,0 +1,132 @@ +package org.wikimedia.analytics.refinery.core; + +public class MediaFileUrlInfo { + + public enum Classification { + UNKNOWN, + ORIGINAL, + TRANSCODED_TO_AUDIO, + TRANSCODED_TO_IMAGE, + TRANSCODED_TO_MOVIE, + } + + private String baseName; + private Classification classification; + private Integer width; + private Integer height; + + public static MediaFileUrlInfo createUnknown() { + return new MediaFileUrlInfo(null, Classification.UNKNOWN, + null, null); + } + + public static MediaFileUrlInfo createOriginal(final String baseName) { + return new MediaFileUrlInfo(baseName, Classification.ORIGINAL, + null, null); + } + + public static MediaFileUrlInfo createTranscodedToImage( + final String baseName, final Integer width) { + return new MediaFileUrlInfo(baseName, + Classification.TRANSCODED_TO_IMAGE, width, null); + } + + public static MediaFileUrlInfo createTranscodedToMovie( + final String baseName, final int height) { + return new MediaFileUrlInfo(baseName, + Classification.TRANSCODED_TO_MOVIE, null, height); + } + + public static MediaFileUrlInfo createTranscodedToAudio( + final String baseName) { + return new MediaFileUrlInfo(baseName, + Classification.TRANSCODED_TO_AUDIO, null, null); + } + + private MediaFileUrlInfo(final String baseName, + final Classification quality, final Integer width, + final Integer height) { + this.baseName = baseName; + this.classification = quality; + this.width = width; + this.height = height; + } + + public String getBaseName() { + return baseName; + } + + public Classification getClassification() { + return classification; + } + + public Integer getWidth() { + return width; + 
} + + public Integer getHeight() { + return height; + } + + @Override + public boolean equals(final Object obj) { + boolean ret = false; + + if (obj instanceof MediaFileUrlInfo) { + MediaFileUrlInfo other = + (MediaFileUrlInfo) obj; + + ret = true; + + ret &= classification == other.classification; + + if (baseName == null) { + ret &= other.baseName == null; + } else { + ret &= baseName.equals(other.baseName); + } + + if (width == null) { + ret &= other.width == null; + } else { + ret &= width.equals(other.width); + } + + if (height == null) { + ret &= other.height == null; + } else { + ret &= height.equals(other.height); + } + } + + return ret; + } + + @Override + public String toString() { + String ret = "MediaFileUrlInfo["; + switch (classification) { + case UNKNOWN: + ret += "unknown"; + break; + case ORIGINAL: + ret += baseName; + ret += ", original"; + break; + case TRANSCODED_TO_AUDIO: + ret += baseName; + ret += ", transcoded to audio"; + break; + case TRANSCODED_TO_IMAGE: + ret += baseName; + ret += ", transcoded to image, width: " + width; + break; + case TRANSCODED_TO_MOVIE: + ret += baseName; + ret += ", transcoded to movie, height: " + height; + break; + } + ret += "]"; + return ret; + } +} diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/MediaFileUrlParser.java b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/MediaFileUrlParser.java new file mode 100644 index 0000000..0574f96 --- /dev/null +++ b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/MediaFileUrlParser.java @@ -0,0 +1,262 @@ +// Copyright 2014 Wikimedia Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package org.wikimedia.analytics.refinery.core; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.lang3.StringUtils; + +/** + * Parse an url for MediaFileUrlInfo + */ +public class MediaFileUrlParser { + /** + * Pattern to match wikis within other patterns + */ + private static Pattern wikiPattern = Pattern.compile("[a-z_-]{2,}[0-9]*"); + + /** + * Pattern to match math urls + */ + private static Pattern mathPattern = Pattern.compile( + "/math" + + "/([0-9-a-f])" + + "/([0-9-a-f])" + + "/([0-9-a-f])" + + "/\\1\\2\\3[0-9-a-f]{29}\\.png"); + + private static Pattern mathPerWikiPattern = Pattern.compile( + "/[^/]*/" + wikiPattern.pattern() + mathPattern.pattern()); + + /** + * Pattern to match score urls + */ + private static Pattern scorePattern = Pattern.compile( + "(/score" + + "/([0-9a-z])" + + "/([0-9a-z])" + + "/(\\2\\3[0-9a-z]{6})[0-9a-z]{23}/\\4\\.)((png)|(ogg|midi))"); + + /** + * Pattern to match timeline urls + */ + private static Pattern timelinePattern = Pattern.compile( + "/[^/]*/" + wikiPattern.pattern() + "/timeline/[0-9-a-f]{32}\\.png"); + + /** + * Pattern to match urls for plain uploaded media files + */ + private static Pattern uploadedPattern = Pattern.compile( + "(/[^/]*/" + wikiPattern.pattern() + ")" // group 1: project + + "(?:/(thumb|transcoded))?" // group 2: Markers for transcodings + + "(/archive|/temp)?" // group 3: Needed to construct basename + + "(/([0-9-a-f])/\\5[0-9-a-f])" // groups 4+5: Hash. 
Needed for backref, and to construct basename + + "/(?:([12][0-9]{3}[01][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-6][0-9])(?:!|%21))?" // group 6: timestamp + + "(([^/]*?)(?:\\.[^./]*)?)" // group 7: the main file name + // group 8: them main file name without suffix (such as ".png") + + "(/" // group 9: the whole transcoding spec + + "(?:lossy-)?" // If transcoding is marked lossy (Like a single page of a tiff -> jpeg) + + "(?:lossless-)?" // If transcoding is marked lossless (Like a single page of a tiff -> png) + + "(?:page[0-9]+-)?" // For single page transcodings of a multi-page original (Like tiff -> png, pdf-> png) + + "(?:lang[a-z-]*-)?" // Rendering only a single language of a multi-language original (Like svg -> png) + + "(?:(?:qlow|mid)-)?" // Quality markers with undefined width/height + + "(?:0*([1-9]+[0-9]*)px-)?" // group 10: Thumbnail pixel width (like 120px) + + "(?:seek(?:=|%3D)[0-9]+(?:\\.[0-9]*)?)?" // Seeking to a timestamp in a video (When transcoding movies to images) + + "-?" // This is the "-" that separates the prepended options from the thumbnail name. + + "(?:\\7|\\8|thumbnail(?:\\.(?:djvu|ogv|pdf|svg|tiff?))?)" // main thumbnail name + + "(?:\\.0*([1-9][0-9]*)p)?" // group 11: Transcoding height + + "(?:\\.(?:" // Ending of transcoded output format for: + + "(ogg)" // group 12: audio files + + "|(gif|jpe?g|png)" // group 13: images + + "|(webm|ogv)" // group 14: movies + + "))" + + ")?"); + + /** + * Parses a string of digits to a bounded Integer, possibly null + * <p/> + * If the string of digits it too to fit in an Integer, the maximum possible + * Integer is returned. + * + * @param digits The string of digits to parse integer + * @return Integer null, if str is null. 
Otherwise a Integer in + * [0, Integer.MAX_VALUE] + */ + private static Integer parseDigitString(String digits) { + Integer ret = null; + if (digits != null) { + try { + ret = Integer.parseInt(digits); + } catch (NumberFormatException e) { + // Since digits is required to be a string of digits, the only way a NumberFormatException can be thrown is that the number is too big. Hence, we bound the the maximum possible integer. + ret = Integer.MAX_VALUE; + } + } + return ret; + } + + /** + * Parses information out of a url for media files in the upload domain + * + * @param url The url to parse + * @return IdentifyMediaFileUrlInfo holding the parsed data. + * null if parsing failed. + */ + public static MediaFileUrlInfo parse(String url) { + final MediaFileUrlInfo ret; + + if (url == null) { + return null; + } + + String uriPath; + + if (url.startsWith("http://upload.wikimedia.org/")) { + uriPath = url.substring(27); + } else if (url.startsWith("https://upload.wikimedia.org/")) { + uriPath = url.substring(28); + } else if (url.startsWith("/")) { + uriPath = url; + } else { + return null; + } + + // url was either protocol- and domain-less, or it is valid for upload. 
+ assert uriPath.startsWith("/") : "uriPath does not start in \"/\", but is " + uriPath; + + uriPath = PercentDecoder.decode(uriPath); + uriPath = uriPath.replaceAll("//+", "/"); + uriPath = uriPath.trim(); + + String[] uriPathParts = StringUtils.split(uriPath, '/'); + + assert uriPathParts != null : "Split gave null array"; + + if (uriPathParts.length < 1) { + return null; + } + + switch (uriPathParts[0]) { + case "math": + if (mathPattern.matcher(uriPath).matches()){ + ret = MediaFileUrlInfo.createOriginal(uriPath); + } else { + return null; + }; + break; + case "score": + Matcher matcher = scorePattern.matcher(uriPath); + if (matcher.matches()) { + String baseName = matcher.group(1) + "png"; + if (matcher.group(6) != null) { + ret = MediaFileUrlInfo.createTranscodedToImage( + baseName, null); + } else if (matcher.group(7) != null) { + ret = MediaFileUrlInfo.createTranscodedToAudio(baseName); + } else { + throw new AssertionError("Logic error due to score having " + + "both group 6 and 7 empty ('" + uriPath + "')"); + } + } else { + return null; + }; + break; + case "wikibooks": + case "wikinews": + case "wikimedia": + case "wikipedia": + case "wikiquote": + case "wikisource": + case "wikiversity": + case "wikivoyage": + case "wiktionary": + Matcher imageMatcher = uploadedPattern.matcher(uriPath); + if (imageMatcher.matches()) { + String project = imageMatcher.group(1); + String transcoding = imageMatcher.group(2); + String timestampFlag = imageMatcher.group(3); + String hash = imageMatcher.group(4); + // No group 5, as that group holds the first hexadecimal digit. + String timestamp = imageMatcher.group(6); + String file = imageMatcher.group(7); + // No group 8, as that group holds the file without suffix. 
+ String transcodingSpec = imageMatcher.group(9); + String widthStr = imageMatcher.group(10); + String heightStr = imageMatcher.group(11); + String transcodedAudioSuffix = imageMatcher.group(12); + String transcodedImageSuffix = imageMatcher.group(13); + String transcodedMovieSuffix = imageMatcher.group(14); + + // Setting basename + final String baseName; + if (timestampFlag == null) { + baseName = project + hash + '/' + file; + } else if ("/archive".equals(timestampFlag)) { + baseName = project + timestampFlag + hash + '/' + + timestamp + '!' + file; + } else if ("/temp".equals(timestampFlag)) { + // Note that the timestamp is matched within the file, so + // no need to add the timestamp here. + baseName = project + timestampFlag + hash + '/' + file; + } else { + throw new AssertionError("Logic error due to timestampFlag" + + " '" + timestampFlag + "' not being handled"); + } + + if (transcoding == null && transcodingSpec == null) { + ret = MediaFileUrlInfo.createOriginal(baseName); + } else if (transcoding != null && transcodingSpec != null) { + if ("thumb".equals(transcoding) + || (transcodedImageSuffix != null)) { + Integer width = parseDigitString(widthStr); + ret = MediaFileUrlInfo.createTranscodedToImage(baseName, width); + } else if (transcodedAudioSuffix != null) { + ret = MediaFileUrlInfo.createTranscodedToAudio(baseName); + } else if (transcodedMovieSuffix != null) { + Integer height = parseDigitString(heightStr); + ret = MediaFileUrlInfo.createTranscodedToMovie(baseName, height); + } else { + throw new AssertionError("Logic error due to" + + "transcodingSpec without a suffix specific handler '" + + transcodingSpec + "'"); + } + } else { + return null; + } + } else if (timelinePattern.matcher(uriPath).matches()) { + ret = MediaFileUrlInfo.createOriginal(uriPath); + } else if (mathPerWikiPattern.matcher(uriPath).matches()) { + ret = MediaFileUrlInfo.createOriginal(uriPath); + } else { + return null; + }; + break; + case "favicon.ico": + if (("/" + 
uriPathParts[0]).equals(uriPath)) { + ret = MediaFileUrlInfo.createOriginal(uriPath); + } else { + return null; + } + break; + default: + return null; + } + + assert (ret != null) : "Logic error, as info is still not set"; + + return ret; + } +} diff --git a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestMediaFileUrlInfo.java b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestMediaFileUrlInfo.java new file mode 100644 index 0000000..fe7833a --- /dev/null +++ b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestMediaFileUrlInfo.java @@ -0,0 +1,323 @@ +package org.wikimedia.analytics.refinery.core; + +import org.wikimedia.analytics.refinery.core.MediaFileUrlInfo.Classification; + +import junit.framework.TestCase; + +public class TestMediaFileUrlInfo extends TestCase { + + public void assertEquals(String message, int expected, Integer actual) { + assertEquals(message, new Integer(expected), actual); + } + + public void assertContainsIgnoreCase(String hayStack, String needle) { + assertTrue("'" + hayStack + "' does not contain '" + needle + "'", + hayStack.toLowerCase().contains(needle.toLowerCase())); + } + + // Factory methods -------------------------------------------------------- + + public void testCreateUnknown() { + MediaFileUrlInfo info = MediaFileUrlInfo.createUnknown(); + + assertNull("Base name not null", info.getBaseName()); + assertEquals("Classification does not match", + Classification.UNKNOWN, info.getClassification()); + assertNull("Width not null", info.getWidth()); + assertNull("Height not null", info.getHeight()); + } + + public void testCreateOriginal() { + MediaFileUrlInfo info = MediaFileUrlInfo.createOriginal("foo"); + + assertEquals("Base name does not match", "foo", info.getBaseName()); + assertEquals("Classification does not match", + Classification.ORIGINAL, info.getClassification()); + assertNull("Width not null for unknown", info.getWidth()); + assertNull("Height not null for 
unknown", info.getHeight()); + } + + public void testCreateTranscodedToAudio() { + MediaFileUrlInfo info = MediaFileUrlInfo.createTranscodedToAudio("foo"); + + assertEquals("Base name does not match", "foo", info.getBaseName()); + assertEquals("Classification does not match", + Classification.TRANSCODED_TO_AUDIO, info.getClassification()); + assertNull("Width not null for unknown", info.getWidth()); + assertNull("Height not null for unknown", info.getHeight()); + } + + public void testCreateTranscodedToImage() { + MediaFileUrlInfo info = MediaFileUrlInfo.createTranscodedToImage("foo", 42); + + assertEquals("Base name does not match", "foo", info.getBaseName()); + assertEquals("Classification does not match", + Classification.TRANSCODED_TO_IMAGE, info.getClassification()); + assertEquals("Width does not match", 42, info.getWidth()); + assertNull("Height not null", info.getHeight()); + } + + public void testCreateTranscodedToMovie() { + MediaFileUrlInfo info = MediaFileUrlInfo.createTranscodedToMovie("foo", 42); + + assertEquals("Base name does not match", "foo", info.getBaseName()); + assertEquals("Classification does not match", + Classification.TRANSCODED_TO_MOVIE, info.getClassification()); + assertNull("Width not null", info.getWidth()); + assertEquals("Height does not match", 42, info.getHeight()); + } + + // Instance methods ------------------------------------------------------- + + // Equals ................................................................. 
+ + public void testUnknowEqualsNull() { + MediaFileUrlInfo info = MediaFileUrlInfo.createUnknown(); + + boolean actual = info.equals(null); + + assertFalse("Proper instance equals null", actual); + } + + public void testUnknownEqualsSame() { + MediaFileUrlInfo info = MediaFileUrlInfo.createUnknown(); + + boolean actual = info.equals(info); + + assertTrue("Proper instance does not equal itself", actual); + } + + public void testUnknownEqualsUnknown() { + MediaFileUrlInfo infoA = MediaFileUrlInfo.createUnknown(); + MediaFileUrlInfo infoB = MediaFileUrlInfo.createUnknown(); + + boolean actual = infoA.equals(infoB); + + assertTrue("Two unknown not equal", actual); + } + + public void testOriginalEqualsUnknown() { + MediaFileUrlInfo infoA = MediaFileUrlInfo.createOriginal("foo"); + MediaFileUrlInfo infoB = MediaFileUrlInfo.createUnknown(); + + boolean actual = infoA.equals(infoB); + + assertFalse("Original and unknown equal", actual); + } + + public void testOriginalEqualsOriginal() { + MediaFileUrlInfo infoA = MediaFileUrlInfo.createOriginal("foo"); + + MediaFileUrlInfo infoB = MediaFileUrlInfo.createOriginal("foo"); + + boolean actual = infoA.equals(infoB); + + assertTrue("Originals not equal", actual); + } + + public void testOriginalEqualsOriginalDifferentBaseName() { + MediaFileUrlInfo infoA = MediaFileUrlInfo.createOriginal("foo"); + + MediaFileUrlInfo infoB = MediaFileUrlInfo.createOriginal("bar"); + + boolean actual = infoA.equals(infoB); + + assertFalse("Base name does not disambiguate", actual); + } + + public void testOriginalEqualsOriginalBaseNameOtherNull() { + MediaFileUrlInfo infoA = MediaFileUrlInfo.createOriginal("foo"); + + MediaFileUrlInfo infoB = MediaFileUrlInfo.createOriginal(null); + + boolean actual = infoA.equals(infoB); + + assertFalse("BaseName does not disambiguate", actual); + } + + public void testOriginalEqualsOriginalBaseNameThisNull() { + MediaFileUrlInfo infoA = MediaFileUrlInfo.createOriginal(null); + + MediaFileUrlInfo infoB = 
MediaFileUrlInfo.createOriginal("foo"); + + boolean actual = infoA.equals(infoB); + + assertFalse("BaseName does not disambiguate", actual); + } + + public void testAudioEqualsAudio() { + MediaFileUrlInfo infoA = + MediaFileUrlInfo.createTranscodedToAudio("foo"); + MediaFileUrlInfo infoB = + MediaFileUrlInfo.createTranscodedToAudio("foo"); + + boolean actual = infoA.equals(infoB); + + assertTrue("Audio files to not equal", actual); + } + + public void testAudioEqualsAudioDifferentBaseName() { + MediaFileUrlInfo infoA = + MediaFileUrlInfo.createTranscodedToAudio("foo"); + MediaFileUrlInfo infoB = + MediaFileUrlInfo.createTranscodedToAudio("bar"); + + boolean actual = infoA.equals(infoB); + + assertFalse("Base name does not disambiguate", actual); + } + + public void testAudioEqualsOriginal() { + MediaFileUrlInfo infoA = + MediaFileUrlInfo.createTranscodedToAudio("foo"); + MediaFileUrlInfo infoB = MediaFileUrlInfo.createOriginal("foo"); + + boolean actual = infoA.equals(infoB); + + assertFalse("Audio and original equal", actual); + } + + public void testImageEqualsImage() { + MediaFileUrlInfo infoA = + MediaFileUrlInfo.createTranscodedToImage("foo", 42); + MediaFileUrlInfo infoB = + MediaFileUrlInfo.createTranscodedToImage("foo", 42); + + boolean actual = infoA.equals(infoB); + + assertTrue("Images not equal", actual); + } + + public void testImageEqualsAudio() { + MediaFileUrlInfo infoA = + MediaFileUrlInfo.createTranscodedToImage("foo", 42); + MediaFileUrlInfo infoB = + MediaFileUrlInfo.createTranscodedToAudio("foo"); + + boolean actual = infoA.equals(infoB); + + assertFalse("Image and audio equal", actual); + } + + public void testImageEqualsImageDifferentBaseName() { + MediaFileUrlInfo infoA = + MediaFileUrlInfo.createTranscodedToImage("foo", 42); + MediaFileUrlInfo infoB = + MediaFileUrlInfo.createTranscodedToImage("bar", 42); + + boolean actual = infoA.equals(infoB); + + assertFalse("Base name does not disambiguate", actual); + } + + public void 
testImageEqualsImageDifferentWidth() { + MediaFileUrlInfo infoA = + MediaFileUrlInfo.createTranscodedToImage("foo", 42); + MediaFileUrlInfo infoB = + MediaFileUrlInfo.createTranscodedToImage("foo", 43); + + boolean actual = infoA.equals(infoB); + + assertFalse("Width does not disambiguate", actual); + } + + public void testMovieEqualsImage() { + MediaFileUrlInfo infoA = + MediaFileUrlInfo.createTranscodedToMovie("foo", 42); + MediaFileUrlInfo infoB = + MediaFileUrlInfo.createTranscodedToImage("foo", 42); + + boolean actual = infoA.equals(infoB); + + assertFalse("Movie and image equal", actual); + } + + public void testMovieEqualsMovie() { + MediaFileUrlInfo infoA = + MediaFileUrlInfo.createTranscodedToMovie("foo", 42); + MediaFileUrlInfo infoB = + MediaFileUrlInfo.createTranscodedToMovie("foo", 42); + + boolean actual = infoA.equals(infoB); + + assertTrue("Movies not equal", actual); + } + + public void testMovieEqualsMovieDifferentBaseName() { + MediaFileUrlInfo infoA = + MediaFileUrlInfo.createTranscodedToMovie("foo", 42); + MediaFileUrlInfo infoB = + MediaFileUrlInfo.createTranscodedToMovie("bar", 42); + + boolean actual = infoA.equals(infoB); + + assertFalse("Base name does not disambiguate", actual); + } + + public void testMovieEqualsMovieDifferentHeight() { + MediaFileUrlInfo infoA = + MediaFileUrlInfo.createTranscodedToMovie("foo", 42); + MediaFileUrlInfo infoB = + MediaFileUrlInfo.createTranscodedToMovie("foo", 43); + + boolean actual = infoA.equals(infoB); + + assertFalse("Height does not disambiguate", actual); + } + + public void testUnknownEqualsMovie() { + MediaFileUrlInfo infoA = MediaFileUrlInfo.createUnknown(); + MediaFileUrlInfo infoB = + MediaFileUrlInfo.createTranscodedToMovie("foo", 42); + + boolean actual = infoA.equals(infoB); + + assertFalse("Unknown and movie equal", actual); + } + + // toString ............................................................... 
+ + public void testUnknown() { + String stringRep = MediaFileUrlInfo.createUnknown().toString(); + + assertContainsIgnoreCase(stringRep, "unknown"); + } + + public void testOriginal() { + String stringRep = MediaFileUrlInfo.createOriginal("foo").toString(); + + assertContainsIgnoreCase(stringRep, "original"); + assertContainsIgnoreCase(stringRep, "foo"); + } + + public void testAudio() { + String stringRep = MediaFileUrlInfo.createTranscodedToAudio("foo").toString(); + + assertContainsIgnoreCase(stringRep, "audio"); + assertContainsIgnoreCase(stringRep, "foo"); + } + + public void testImageWidthNonNull() { + String stringRep = MediaFileUrlInfo.createTranscodedToImage("foo", 42).toString(); + + assertContainsIgnoreCase(stringRep, "image"); + assertContainsIgnoreCase(stringRep, "foo"); + assertContainsIgnoreCase(stringRep, "42"); + } + + public void testImageWidthNull() { + String stringRep = MediaFileUrlInfo.createTranscodedToImage("foo", null).toString(); + + assertContainsIgnoreCase(stringRep, "image"); + assertContainsIgnoreCase(stringRep, "foo"); + assertContainsIgnoreCase(stringRep, "null"); + } + + public void testMovie() { + String stringRep = MediaFileUrlInfo.createTranscodedToMovie("foo", 42).toString(); + + assertContainsIgnoreCase(stringRep, "movie"); + assertContainsIgnoreCase(stringRep, "foo"); + assertContainsIgnoreCase(stringRep, "42"); + } +} diff --git a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestMediaFileUrlParser.java b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestMediaFileUrlParser.java new file mode 100644 index 0000000..dd9e216 --- /dev/null +++ b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestMediaFileUrlParser.java @@ -0,0 +1,615 @@ +// Copyright 2014 Wikimedia Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package org.wikimedia.analytics.refinery.core; + +import junit.framework.TestCase; + +public class TestMediaFileUrlParser extends TestCase { + + // Helper methods --------------------------------------------------------- + + private void assertParsed(final String url, + final MediaFileUrlInfo expected) { + MediaFileUrlInfo actual = MediaFileUrlParser.parse(url); + + assertEquals("Parsed info does not equal expected", expected, actual); + } + + private void assertUnidentified(final String url) { + assertParsed(url, null); + } + + private void assertOriginal(final String url, + final String baseName) { + assertParsed(url, MediaFileUrlInfo.createOriginal(baseName)); + } + + private void assertOriginal(final String url) { + assertOriginal(url, url); + } + + private void assertImage(final String url, + final String baseName, final Integer width) { + assertParsed(url, + MediaFileUrlInfo.createTranscodedToImage(baseName, width)); + } + + private void assertMovie(final String url, + final String baseName, final int height) { + assertParsed(url, + MediaFileUrlInfo.createTranscodedToMovie(baseName, height)); + } + + private void assertAudio(final String url, final String baseName) { + assertParsed(url, MediaFileUrlInfo.createTranscodedToAudio(baseName)); + } + + // Test degenerate settings ----------------------------------------------- + + public void testNull() { + assertUnidentified(null); + } + + public void testEmpty() { + assertUnidentified(""); + } + + public void testPlainSlash() { + assertUnidentified("/"); + } + + public void 
testLongPixelStringLowResolution() { + assertImage( + "/wikipedia/commons/thumb/8/83/Kit_body.svg/00000000000000000000000000000000000000000000000000000000000000000000000000000000000000001px-Kit_body.svg.png", + "/wikipedia/commons/8/83/Kit_body.svg", 1); + } + + public void testLongPixelStringHighResolution() { + assertImage( + "/wikipedia/commons/thumb/8/83/Kit_body.svg/10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000px-Kit_body.svg.png", + "/wikipedia/commons/8/83/Kit_body.svg", + Integer.MAX_VALUE); + } + + // Test protocols --------------------------------------------------------- + + public void testNoProtocolNoLeadingSlash() { + assertUnidentified("math/d/a/9/da9d325123d50dbc4e36363f2863ce3e.png"); + } + + public void testNoProtocolLeadingSlash() { + assertOriginal("/math/d/a/9/da9d325123d50dbc4e36363f2863ce3e.png"); + } + + public void testHttp() { + assertOriginal( + "http://upload.wikimedia.org/math/d/a/9/da9d325123d50dbc4e36363f2863ce3e.png", + "/math/d/a/9/da9d325123d50dbc4e36363f2863ce3e.png"); + } + + public void testHttpPlainSlash() { + assertUnidentified("http://upload.wikimedia.org/"); + } + + public void testHttps() { + assertOriginal( + "https://upload.wikimedia.org/math/d/a/9/da9d325123d50dbc4e36363f2863ce3e.png", + "/math/d/a/9/da9d325123d50dbc4e36363f2863ce3e.png"); + } + + public void testHttpsPlainSlash() { + assertUnidentified("https://upload.wikimedia.org/"); + } + + // Test uri cleanup ------------------------------------------------------- + + public void testSlashTrimming() { + assertOriginal( + "/math/d/a///9//da9d325123d50dbc4e36363f2863ce3e.png", + "/math/d/a/9/da9d325123d50dbc4e36363f2863ce3e.png"); + } + + public void testDecodingThumbNoneEncoded() { + assertImage( + "/wikipedia/commons/thumb/7/7a/Japan_on_the_globe_(claimed)_(Japan_centered).svg/240px-Japan_on_the_globe_(claimed)_(Japan_centered).svg.png", + "/wikipedia/commons/7/7a/Japan_on_the_globe_(claimed)_(Japan_centered).svg", + 240); 
+ } + + public void testDecodingThumbBothEncoded() { + assertImage( + "/wikipedia/commons/thumb/7/7a/Japan_on_the_globe_%28claimed%29_%28Japan_centered%29.svg/240px-Japan_on_the_globe_%28claimed%29_%28Japan_centered%29.svg.png", + "/wikipedia/commons/7/7a/Japan_on_the_globe_(claimed)_(Japan_centered).svg", + 240); + } + + public void testDecodingThumbOnlyMainPartEncoded() { + assertImage( + "/wikipedia/commons/thumb/7/7a/Japan_on_the_globe_%28claimed%29_%28Japan_centered%29.svg/240px-Japan_on_the_globe_(claimed)_(Japan_centered).svg.png", + "/wikipedia/commons/7/7a/Japan_on_the_globe_(claimed)_(Japan_centered).svg", + 240); + } + + public void testDecodingThumbOnlyThumbEncoded() { + assertImage( + "/wikipedia/commons/thumb/7/7a/Japan_on_the_globe_(claimed)_(Japan_centered).svg/240px-Japan_on_the_globe_%28claimed%29_%28Japan_centered%29.svg.png", + "/wikipedia/commons/7/7a/Japan_on_the_globe_(claimed)_(Japan_centered).svg", + 240); + } + + public void testTrimming() { + assertImage( + "/wikipedia/commons/thumb/a/ae/Essig-1.jpg/459px-Essig-1.jpg%20%20%20", + "/wikipedia/commons/a/ae/Essig-1.jpg", 459); + } + + // Test static assets ----------------------------------------------------- + + public void testFavicon() { + assertOriginal("/favicon.ico"); + } + + public void testFaviconWithPathlessSuffix() { + assertUnidentified("/favicon.icofoo"); + } + + public void testFaviconWithPathedSuffix() { + assertUnidentified("/favicon.ico/foo"); + } + + // Test math images ------------------------------------------------------- + + public void testMathPlain() { + assertOriginal("/math/d/a/9/da9d325123d50dbc4e36363f2863ce3e.png"); + } + + public void testMathNonHexFirstUrlPart() { + assertUnidentified("/math/X/a/9/da9d325123d50dbc4e36363f2863ce3e.png"); + } + + public void testMathNonHexSecondUrlPart() { + assertUnidentified("/math/d/Y/9/da9d325123d50dbc4e36363f2863ce3e.png"); + } + + public void testMathNonHexThirdUrlPart() { + 
assertUnidentified("/math/d/a/Z/da9d325123d50dbc4e36363f2863ce3e.png"); + } + + public void testMathNonHexHash() { + assertUnidentified("/math/d/a/9/da9d32512Qd50dbc4e36363f2863ce3e.png"); + } + + public void testMathHashTooLong() { + assertUnidentified("/math/d/a/9/da9d325123d50dbc4e36363f2863ce3e0.png"); + } + + public void testMathHashTooShort() { + assertUnidentified("/math/d/a/9/da9d325123d50dbc4e36363f2863ce3.png"); + } + + public void testMathFirstHashDigitMismatch() { + assertUnidentified("/math/0/a/9/da9d325123d50dbc4e36363f2863ce3e.png"); + } + + public void testMathSecondHashDigitMismatch() { + assertUnidentified("/math/d/0/9/da9d325123d50dbc4e36363f2863ce3e.png"); + } + + public void testMathThirdHashDigitMismatch() { + assertUnidentified("/math/d/a/0/da9d325123d50dbc4e36363f2863ce3e.png"); + } + + public void testMathPerWikiPlain() { + assertOriginal("/wikipedia/en/math/5/6/a/56a5d0fae0136327e61476dcfe43109a.png"); + } + + // Test score ------------------------------------------------------------- + + public void testScore() { + assertImage( + "/score/7/a/7aem9jwwirkhn0ucbewj9gs7aofzc2b/7aem9jww.png", + "/score/7/a/7aem9jwwirkhn0ucbewj9gs7aofzc2b/7aem9jww.png", null); + } + + public void testScoreNonAlphaNumFirstPart() { + assertUnidentified("/score/-/a/7aem9jwwirkhn0ucbewj9gs7aofzc2b/7aem9jww.png"); + } + + public void testScoreNonAlphaNumSecondPart() { + assertUnidentified("/score/7/-/7aem9jwwirkhn0ucbewj9gs7aofzc2b/7aem9jww.png"); + } + + public void testScoreNonAlphaNumThirdPart() { + assertUnidentified("/score/7/a/7ae-9jwwirkhn0ucbewj9gs7aofzc2b/7aem9jww.png"); + } + + public void testScoreNonAlphaNumFourthPart() { + assertUnidentified("/score/7/a/7aem9jwwirkhn0ucbewj9gs7aofzc2b/7aem9jw-.png"); + } + + public void testScoreNonMatchingFirstPart() { + assertUnidentified("/score/8/a/7aem9jwwirkhn0ucbewj9gs7aofzc2b/7aem9jww.png"); + } + + public void testScoreNonMatchingSecondPart() { + 
assertUnidentified("/score/7/b/7aem9jwwirkhn0ucbewj9gs7aofzc2b/7aem9jww.png"); + } + + public void testScoreNonMatchingThirdPart() { + assertUnidentified("/score/7/a/7aeg9jwwirkhn0ucbewj9gs7aofzc2b/7aem9jww.png"); + } + + public void testScoreNonMatchingFourthPart() { + assertUnidentified("/score/7/a/7aem9jwwirkhn0ucbewj9gs7aofzc2b/7aem0jww.png"); + } + + public void testScoreTooLongThirdPart() { + assertUnidentified("/score/7/a/7aem9jwwirkhn0ucbewj9gs7aofzc2bc/7aem9jww.png"); + } + + public void testScoreTooLongFourthPart() { + assertUnidentified("/score/7/a/7aem9jwwirkhn0ucbewj9gs7aofzc2b/7aem9jwwi.png"); + } + + public void testScoreOgg() { + assertAudio("/score/q/0/q0bopydzemuz315z4n6dvg8sfu8qsu0/q0bopydz.ogg", + "/score/q/0/q0bopydzemuz315z4n6dvg8sfu8qsu0/q0bopydz.png"); + } + + public void testScoreMidi() { + assertAudio("/score/k/7/k7yj1lvc3fqecbmknn497haqj6x9g2y/k7yj1lvc.midi", + "/score/k/7/k7yj1lvc3fqecbmknn497haqj6x9g2y/k7yj1lvc.png"); + } + + // Test timeline image ---------------------------------------------------- + + public void testTimeline() { + assertOriginal("/wikipedia/en/timeline/12435a102adebdee9059bc97bb652af1.png"); + } + + // Test uploaded media files ---------------------------------------------- + + public void testMediaMeta() { + assertOriginal("/wikipedia/meta/7/74/Wikibooks-logo_sister_1x.png"); + } + + public void testMediaNonHexFirstPart() { + assertUnidentified("/wikipedia/meta/X/74/Wikibooks-logo_sister_1x.png"); + } + + public void testMediaNonHexSecondPart() { + assertUnidentified("/wikipedia/meta/7/7X/Wikibooks-logo_sister_1x.png"); + } + + public void testMediaFirstAndSecondPartMismatch() { + assertUnidentified("/wikipedia/meta/7/84/Wikibooks-logo_sister_1x.png"); + } + + public void testMediaCommons() { + assertOriginal("/wikipedia/commons/d/dd/Fumiyuki_Beppu_Giro_2011.jpg"); + } + + public void testMediaWikibooks() { + assertOriginal("/wikibooks/en/b/bc/Wiki.png"); + } + + public void testMediaWiktionary() { + 
assertOriginal("/wiktionary/fr/b/bc/Wiki.png"); + } + + public void testMediaWikinews() { + assertOriginal("/wikinews/en/f/f7/Twitter.png"); + } + + public void testMediaWikiquote() { + assertOriginal("/wikiquote/en/b/bc/Wiki.png"); + } + + public void testMediaWikisource() { + assertOriginal("/wikisource/ar/d/dd/Foo.pdf"); + } + + public void testMediaWikiversity() { + assertOriginal("/wikiversity/ru/b/b6/Diffuziya_v_menzurke.jpg"); + } + + public void testMediaWikivoyage() { + assertOriginal("/wikivoyage/ru/c/ce/Map_mag.png"); + } + + public void testMediaWikimedia() { + assertImage( + "/wikimedia/pl/thumb/4/47/Spraw_2010_OPP.pdf/page21-180px-Spraw_2010_OPP.pdf.jpg", + "/wikimedia/pl/4/47/Spraw_2010_OPP.pdf", 180); + } + + public void testMediaWikimania2014() { + assertImage( + "/wikipedia/wikimania2014/thumb/a/ae/Rufus_Pollock.png/293px-Rufus_Pollock.png", + "/wikipedia/wikimania2014/a/ae/Rufus_Pollock.png", 293); + } + + // Test uploaded media files; Thumbs -------------------------------------- + + public void testMediaThumbLowQuality() { + assertImage( + "/wikipedia/it/thumb/0/0d/Venosa-Stemma.png/50px-Venosa-Stemma.png", + "/wikipedia/it/0/0d/Venosa-Stemma.png", 50); + } + + public void testMediaThumbHighQuality() { + assertImage( + "/wikipedia/commons/thumb/0/01/USS_Texas_BB-35_aircastle.jpg/1024px-USS_Texas_BB-35_aircastle.jpg", + "/wikipedia/commons/0/01/USS_Texas_BB-35_aircastle.jpg", 1024); + } + + public void testMediaThumbGif() { + assertImage( + "/wikipedia/ar/thumb/c/c1/Logo_of_the_African_Union.png/60px-Logo_of_the_African_Union.png.gif", + "/wikipedia/ar/c/c1/Logo_of_the_African_Union.png", 60); + } + + public void testMediaThumbPngJpeg() { + assertImage( + "/wikipedia/ru/thumb/2/29/MagicDepartment.png/240px-MagicDepartment.png.jpeg", + "/wikipedia/ru/2/29/MagicDepartment.png", 240); + } + + public void testMediaThumbDjvu() { + assertImage( + "/wikipedia/commons/thumb/b/b5/foo.djvu/page1-800px-thumbnail.djvu.jpg", + 
"/wikipedia/commons/b/b5/foo.djvu", 800); + } + + public void testMediaThumbSvgPng() { + assertImage( + "/wikipedia/commons/thumb/a/ae/Flag_of_the_United_Kingdom.svg/24px-Flag_of_the_United_Kingdom.svg.png", + "/wikipedia/commons/a/ae/Flag_of_the_United_Kingdom.svg", 24); + } + + public void testMovieThumbWithoutFormatEnding() { + assertImage( + "/wikipedia/commons/thumb/9/9f/Chicago_-_State_St_at_Madison_Ave%2C_1897.ogv/180px-Chicago_-_State_St_at_Madison_Ave%2C_1897.ogv", + "/wikipedia/commons/9/9f/Chicago_-_State_St_at_Madison_Ave,_1897.ogv", + 180); + } + + // Test uploaded media files; Specialities -------------------------------- + + public void testMediaThumbQLow() { + assertImage( + "/wikipedia/commons/thumb/8/8c/Google_Mountain_View_campus_garden.jpg/qlow-330px-Google_Mountain_View_campus_garden.jpg", + "/wikipedia/commons/8/8c/Google_Mountain_View_campus_garden.jpg", + 330); + } + + + public void testMediaThumbMid() { + assertImage( + "/wikipedia/commons/thumb/7/7d/Will_Success_Spoil_Rock_Hunter_trailer.ogv/mid-Will_Success_Spoil_Rock_Hunter_trailer.ogv.jpg", + "/wikipedia/commons/7/7d/Will_Success_Spoil_Rock_Hunter_trailer.ogv", + null); + } + + public void testMediaSeek() { + assertImage( + "/wikipedia/commons/thumb/3/3d/Suez_nationalization.ogv/seek%3D151-Suez_nationalization.ogv.jpg", + "/wikipedia/commons/3/3d/Suez_nationalization.ogv", + null); + } + + public void testMediaSeekWithResolution() { + assertImage( + "/wikipedia/commons/thumb/3/3d/Suez_nationalization.ogv/1000px-seek%3D151-Suez_nationalization.ogv.jpg", + "/wikipedia/commons/3/3d/Suez_nationalization.ogv", 1000); + } + + public void testMediaThumbWithoutNameDuplication() { + assertImage( + "/wikipedia/commons/thumb/8/8c/Google_Mountain_View_campus_garden.jpg/330px-thumbnail.jpg", + "/wikipedia/commons/8/8c/Google_Mountain_View_campus_garden.jpg", + 330); + } + + public void testMediaThumbPdfWithoutNameDuplication() { + assertImage( + 
"/wikipedia/commons/thumb/2/21/Quetzalcatl_-_Divindade_adorada_pelos_Asteca_Tolteca_e_Maias_quem_teria_no_s_originado_os_homens_como_tambm_providenciado_seu_principal_alimento_o_milho.pdf/page1-220px-thumbnail.pdf.jpg", + "/wikipedia/commons/2/21/Quetzalcatl_-_Divindade_adorada_pelos_Asteca_Tolteca_e_Maias_quem_teria_no_s_originado_os_homens_como_tambm_providenciado_seu_principal_alimento_o_milho.pdf", + 220); + } + + public void testMediaThumbTifWithoutNameDuplication() { + assertImage( + "/wikipedia/commons/thumb/7/72/EXTERIOR_DETAIL_VIEW_OF_THE_UMBRA_FROM_THE_SOUTH_-_Mark_Twain_House_351_Farmington_Avenue_corrected_from_original_address_of_531_Farmington_Avenue_Hartford_Hartford_HABS_CONN-HARF16-30.tif/lossy-page1-120px-thumbnail.tif.jpg", + "/wikipedia/commons/7/72/EXTERIOR_DETAIL_VIEW_OF_THE_UMBRA_FROM_THE_SOUTH_-_Mark_Twain_House_351_Farmington_Avenue_corrected_from_original_address_of_531_Farmington_Avenue_Hartford_Hartford_HABS_CONN-HARF16-30.tif", + 120); + } + + public void testMediaThumbSvgWithoutNameDuplication() { + assertImage( + "/wikipedia/commons/thumb/6/6e/ABS-6457.0-InternationalTradePriceIndexesAustralia-ExportPriceIndexBySitcIndexNumbersPercentageChanges-IndexNumbers-ManufacturedGoodsClassifiedChieflyByMaterial6-A2295543A.svg/300px-thumbnail.svg.png", + "/wikipedia/commons/6/6e/ABS-6457.0-InternationalTradePriceIndexesAustralia-ExportPriceIndexBySitcIndexNumbersPercentageChanges-IndexNumbers-ManufacturedGoodsClassifiedChieflyByMaterial6-A2295543A.svg", + 300); + } + + public void testMediaThumbOgvWithoutNameDuplication() { + assertImage( + "/wikipedia/commons/thumb/e/e8/Putin_talk_2011-12-15_00695-00810_....ogv/250px--thumbnail.ogv.jpg", + "/wikipedia/commons/e/e8/Putin_talk_2011-12-15_00695-00810_....ogv", + 250); + } + + public void testMediaThumbTiffWithoutNameDuplication() { + assertImage( + "/wikipedia/commons/thumb/a/a1/Queens__Vol._2..._NYPL1693954.tiff/lossy-page1-120px-thumbnail.tiff.jpg", + 
"/wikipedia/commons/a/a1/Queens__Vol._2..._NYPL1693954.tiff", + 120); + } + + public void testMediaThumbLangFr() { + assertImage( + "/wikipedia/commons/thumb/8/85/Defaut.svg/langfr-250px-Defaut.svg.png", + "/wikipedia/commons/8/85/Defaut.svg", 250); + } + + public void testMediaThumbLangFrHighResolution() { + assertImage( + "/wikipedia/commons/thumb/8/85/Defaut.svg/langfr-2500px-Defaut.svg.png", + "/wikipedia/commons/8/85/Defaut.svg", 2500); + } + + public void testMediaThumbLangPl() { + assertImage( + "/wikipedia/commons/thumb/1/1d/First_Ionization_Energy.svg/langpl-400px-First_Ionization_Energy.svg.png", + "/wikipedia/commons/1/1d/First_Ionization_Energy.svg", 400); + } + + public void testMediaThumbLangPlHighResolution() { + assertImage( + "/wikipedia/commons/thumb/1/1d/First_Ionization_Energy.svg/langpl-4000px-First_Ionization_Energy.svg.png", + "/wikipedia/commons/1/1d/First_Ionization_Energy.svg", 4000); + } + + public void testMediaThumbLangZhHans() { + assertImage( + "/wikipedia/commons/thumb/1/1d/First_Ionization_Energy.svg/langzh-hans-400px-First_Ionization_Energy.svg.png", + "/wikipedia/commons/1/1d/First_Ionization_Energy.svg", 400); + } + + public void testMediaThumbLangZhHansHighResolution() { + assertImage( + "/wikipedia/commons/thumb/1/1d/First_Ionization_Energy.svg/langzh-hans-4000px-First_Ionization_Energy.svg.png", + "/wikipedia/commons/1/1d/First_Ionization_Energy.svg", 4000); + } + + public void testMediaThumbLangUpperCase() { + assertUnidentified("/wikipedia/commons/thumb/1/1d/First_Ionization_Energy.svg/langXr-400px-First_Ionization_Energy.svg.png"); + } + + public void testMediaThumbLangNumber() { + assertUnidentified("/wikipedia/commons/thumb/1/1d/First_Ionization_Energy.svg/lang7-400px-First_Ionization_Energy.svg.png"); + } + + public void testMediaThumbLangNonALpha() { + assertUnidentified("/wikipedia/commons/thumb/1/1d/First_Ionization_Energy.svg/lang?-400px-First_Ionization_Energy.svg.png"); + } + + public void testMediaThumbPaged() { + 
assertImage( + "/wikipedia/commons/thumb/6/6a/DiagFuncMacroSyst.pdf/page1-450px-DiagFuncMacroSyst.pdf.jpg", + "/wikipedia/commons/6/6a/DiagFuncMacroSyst.pdf", 450); + } + + public void testMediaThumbPagedLossy() { + assertImage( + "/wikipedia/commons/thumb/0/02/1969_Afghanistan_Sistan_wind_ripples.tiff/lossy-page1-220px-1969_Afghanistan_Sistan_wind_ripples.tiff.jpg", + "/wikipedia/commons/0/02/1969_Afghanistan_Sistan_wind_ripples.tiff", + 220); + } + + public void testMediaThumbPagedLossless() { + assertImage( + "/wikipedia/commons/thumb/b/b2/KeyCard.tiff/lossless-page1-220px-KeyCard.tiff.png", + "/wikipedia/commons/b/b2/KeyCard.tiff", 220); + } + + public void testMediaThumbDoubleDash() { + assertImage("http://upload.wikimedia.org/wikipedia/commons/thumb/1/1a/Reichelt.ogg/220px--Reichelt.ogg.jpg", + "/wikipedia/commons/1/1a/Reichelt.ogg", 220); + } + + public void testMediaThumbSeekInteger() { + assertImage( + "/wikipedia/commons/thumb/d/df/Emu_feeding_on_grass.ogg/220px-seek%3D43-Emu_feeding_on_grass.ogg.jpg", + "/wikipedia/commons/d/df/Emu_feeding_on_grass.ogg", 220); + } + + public void testMediaThumbSeekFraction() { + assertImage( + "/wikipedia/commons/thumb/a/af/Pelecanus_occidentalis_-Jamaica_-fishing-8.ogv/220px-seek%3D2.5-Pelecanus_occidentalis_-Jamaica_-fishing-8.ogv.jpg", + "/wikipedia/commons/a/af/Pelecanus_occidentalis_-Jamaica_-fishing-8.ogv", + 220); + } + + public void testMediaThumbTemp() { + assertImage( + "/wikipedia/commons/thumb/temp/5/57/20141026110003%21phphHKAQ2.jpg/100px-20141026110003%21phphHKAQ2.jpg", + "/wikipedia/commons/temp/5/57/20141026110003!phphHKAQ2.jpg", + 100); + } + + // Test uploaded media files; Archive ------------------------------------- + + public void testMediaArchive() { + assertOriginal( + "/wikipedia/sr/archive/2/25/20121208204804!ZvezdeGranda.jpg", + "/wikipedia/sr/archive/2/25/20121208204804!ZvezdeGranda.jpg"); + } + + public void testMediaArchiveEncoded() { + assertOriginal( + 
"/wikipedia/sr/archive/2/25/20121208204804%21ZvezdeGranda.jpg", + "/wikipedia/sr/archive/2/25/20121208204804!ZvezdeGranda.jpg"); + } + + public void testMediaThumbArchive() { + assertImage( + "/wikipedia/commons/thumb/archive/b/bb/20100220202202!Polska_1386_-_1434.png/120px-Polska_1386_-_1434.png", + "/wikipedia/commons/archive/b/bb/20100220202202!Polska_1386_-_1434.png", + 120); + } + + public void testMediaThumbArchiveEncoded() { + assertImage( + "/wikipedia/commons/thumb/archive/b/bb/20100220202202%21Polska_1386_-_1434.png/120px-Polska_1386_-_1434.png", + "/wikipedia/commons/archive/b/bb/20100220202202!Polska_1386_-_1434.png", + 120); + } + + // Test uploaded media files; Transcoded ---------------------------------- + + public void testMediaTranscodedWebM() { + assertMovie( + "/wikipedia/commons/transcoded/3/31/Lheure_du_foo.ogv/Lheure_du_foo.ogv.360p.webm", + "/wikipedia/commons/3/31/Lheure_du_foo.ogv", 360); + } + + public void testMediaTranscodedWebMHighResolution() { + assertMovie( + "/wikipedia/commons/transcoded/3/31/Lheure_du_foo.ogv/Lheure_du_foo.ogv.480p.webm", + "/wikipedia/commons/3/31/Lheure_du_foo.ogv", + 480); + } + + public void testMediaTranscodedOgv() { + assertMovie( + "/wikipedia/commons/transcoded/d/d3/Yvonne_Strahovski_about_her_acting_career.webm/Yvonne_Strahovski_about_her_acting_career.webm.360p.ogv", + "/wikipedia/commons/d/d3/Yvonne_Strahovski_about_her_acting_career.webm", + 360); + } + + public void testMediaTranscodedOgvHighResolution() { + assertMovie( + "/wikipedia/commons/transcoded/d/d3/Yvonne_Strahovski_about_her_acting_career.webm/Yvonne_Strahovski_about_her_acting_career.webm.480p.ogv", + "/wikipedia/commons/d/d3/Yvonne_Strahovski_about_her_acting_career.webm", + 480); + } + + public void testMediaTranscodedOgg() { + assertAudio( + "/wikipedia/commons/transcoded/b/bd/Xylophone_jingle.wav/Xylophone_jingle.wav.ogg", + "/wikipedia/commons/b/bd/Xylophone_jingle.wav"); + } +} diff --git 
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsLegacyPageviewUDF.java b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsLegacyPageviewUDF.java index 733cb20..9bd5905 100644 --- a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsLegacyPageviewUDF.java +++ b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsLegacyPageviewUDF.java @@ -22,7 +22,7 @@ /** * A Hive UDF to identify what requests constitute "pageviews", - * according to the definition at + * according to the definition at * https://github.com/wikimedia/analytics-refinery/blob/master/oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql * This is the "legacy" definition, in use by WebStatsCollector and the * pageviews dumps at http://dumps.wikimedia.org/other/pagecounts-ez/ diff --git a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/MediaFileUrlParserUDF.java b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/MediaFileUrlParserUDF.java new file mode 100644 index 0000000..88e3a50 --- /dev/null +++ b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/MediaFileUrlParserUDF.java @@ -0,0 +1,165 @@ +// Copyright 2014 Wikimedia Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+package org.wikimedia.analytics.refinery.hive;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
+import org.wikimedia.analytics.refinery.core.MediaFileUrlParser;
+import org.wikimedia.analytics.refinery.core.MediaFileUrlInfo;
+import org.wikimedia.analytics.refinery.core.MediaFileUrlInfo.Classification;
+import org.wikimedia.analytics.refinery.core.PercentEncoder;
+
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * A Hive UDF that parses the url of a media file into a struct of details.
+ *
+ * <p>The returned struct has the fields {@code base_name},
+ * {@code is_original}, {@code is_transcoded_to_audio},
+ * {@code is_transcoded_to_image}, {@code is_transcoded_to_movie},
+ * {@code width}, and {@code height} (in this order).
+ */
+// "deterministic" is the default anyway, but we want to make it visible,
+// hence we set it explicitly.
+@UDFType(deterministic = true)
+@Description(name = "parse_media_file_url",
+    value = "_FUNC_(url) - Returns a struct of details about a media file url",
+    extended = "argument 0 is the url to analyze")
+public class MediaFileUrlParserUDF extends GenericUDF {
+    /** Class name as used in error and assertion messages. */
+    private static final String UDF_NAME = "MediaFileUrlParserUDF";
+
+    // Holds one slot per struct field. The same array is handed out by
+    // every call to evaluate.
+    private Object[] result;
+
+    private StringObjectInspector inputOI;
+
+    // Positions of the struct fields within result. They get assigned in
+    // initialize, in the order in which the fields are declared there.
+    private int idxBaseName;
+    private int idxIsOriginal;
+    private int idxIsTranscodedAudio;
+    private int idxIsTranscodedImage;
+    private int idxIsTranscodedMovie;
+    private int idxWidth;
+    private int idxHeight;
+
+    /**
+     * Checks the argument inspectors and sets up the result struct.
+     *
+     * @param arguments inspectors of the UDF's arguments. Has to hold
+     *     exactly one string inspector.
+     * @return the inspector for the struct that evaluate returns
+     * @throws UDFArgumentLengthException if not exactly one argument is
+     *     given
+     * @throws UDFArgumentTypeException if the argument is not a string
+     */
+    @Override
+    public ObjectInspector initialize(ObjectInspector[] arguments)
+            throws UDFArgumentException {
+        // We need exactly 1 parameter
+        if (arguments == null || arguments.length != 1) {
+            throw new UDFArgumentLengthException("The function "
+                    + UDF_NAME + " expects exactly 1 parameter");
+        }
+
+        // ... and the parameter has to be a string
+        if (!(arguments[0] instanceof StringObjectInspector)) {
+            throw new UDFArgumentTypeException(0, "The parameter to "
+                    + UDF_NAME + " has to be a string");
+        }
+
+        inputOI = (StringObjectInspector) arguments[0];
+
+        List<String> fieldNames = new LinkedList<String>();
+        List<ObjectInspector> fieldOIs = new LinkedList<ObjectInspector>();
+
+        idxBaseName = addField(fieldNames, fieldOIs, "base_name",
+                PrimitiveObjectInspectorFactory.javaStringObjectInspector);
+        idxIsOriginal = addField(fieldNames, fieldOIs, "is_original",
+                PrimitiveObjectInspectorFactory.javaBooleanObjectInspector);
+        idxIsTranscodedAudio = addField(fieldNames, fieldOIs,
+                "is_transcoded_to_audio",
+                PrimitiveObjectInspectorFactory.javaBooleanObjectInspector);
+        idxIsTranscodedImage = addField(fieldNames, fieldOIs,
+                "is_transcoded_to_image",
+                PrimitiveObjectInspectorFactory.javaBooleanObjectInspector);
+        idxIsTranscodedMovie = addField(fieldNames, fieldOIs,
+                "is_transcoded_to_movie",
+                PrimitiveObjectInspectorFactory.javaBooleanObjectInspector);
+        idxWidth = addField(fieldNames, fieldOIs, "width",
+                PrimitiveObjectInspectorFactory.javaIntObjectInspector);
+        idxHeight = addField(fieldNames, fieldOIs, "height",
+                PrimitiveObjectInspectorFactory.javaIntObjectInspector);
+
+        result = new Object[fieldNames.size()];
+
+        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
+    }
+
+    /**
+     * Appends a struct field and returns the position it ended up at.
+     */
+    private static int addField(List<String> fieldNames,
+            List<ObjectInspector> fieldOIs, String name,
+            ObjectInspector inspector) {
+        fieldNames.add(name);
+        fieldOIs.add(inspector);
+        return fieldNames.size() - 1;
+    }
+
+    /**
+     * Parses the url given as first argument.
+     *
+     * @param arguments array holding the url as its only entry
+     * @return the result array, filled with the details of the url. If no
+     *     details are available, the base name, width, and height are
+     *     null, and all flags are false.
+     */
+    @Override
+    public Object evaluate(DeferredObject[] arguments) throws HiveException {
+        assert arguments != null : "Method 'evaluate' of " + UDF_NAME
+                + " called with null arguments array";
+        assert arguments.length == 1 : "Method 'evaluate' of " + UDF_NAME
+                + " called with arguments of length " + arguments.length
+                + " (instead of 1)";
+        // arguments is an array with exactly 1 entry.
+
+        assert result != null : "Result object has not yet been initialized, "
+                + "but evaluate called";
+        // result object has been initialized. So it's an array of objects of
+        // the right length.
+
+        String url = inputOI.getPrimitiveJavaObject(arguments[0].get());
+
+        // A NULL url is not handed to the parser. It is reported in the
+        // same way as a url for which the parser returned nothing.
+        MediaFileUrlInfo info =
+                (url == null) ? null : MediaFileUrlParser.parse(url);
+
+        String baseName = null;
+        Classification classification = null;
+        Integer width = null;
+        Integer height = null;
+        if (info != null) {
+            baseName = PercentEncoder.encode(info.getBaseName());
+            classification = info.getClassification();
+            width = info.getWidth();
+            height = info.getHeight();
+        }
+
+        result[idxBaseName] = baseName;
+
+        // With classification being null, all of the flags come out false.
+        result[idxIsOriginal] = (classification == Classification.ORIGINAL);
+        result[idxIsTranscodedAudio] = (classification == Classification.TRANSCODED_TO_AUDIO);
+        result[idxIsTranscodedImage] = (classification == Classification.TRANSCODED_TO_IMAGE);
+        result[idxIsTranscodedMovie] = (classification == Classification.TRANSCODED_TO_MOVIE);
+
+        result[idxWidth] = width;
+        result[idxHeight] = height;
+
+        return result;
+    }
+
+    /**
+     * Renders the call of this UDF for the given argument descriptions.
+     */
+    @Override
+    public String getDisplayString(String[] arguments) {
+        String argument;
+        if (arguments == null) {
+            argument = "<arguments == null>";
+        } else if (arguments.length == 1) {
+            argument = arguments[0];
+        } else {
+            argument = "<arguments of length " + arguments.length + ">";
+        }
+        return "parse_media_file_url(" + argument + ")";
+    }
+}
diff --git a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestMediaFileUrlParserUDF.java b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestMediaFileUrlParserUDF.java
new file mode 100644
index 0000000..8a6694e
--- /dev/null
+++ b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestMediaFileUrlParserUDF.java
@@ -0,0 +1,156 @@
+// Copyright 2014 Wikimedia
Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.wikimedia.analytics.refinery.hive;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.wikimedia.analytics.refinery.core.MediaFileUrlInfo.Classification;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests initialization and evaluation of MediaFileUrlParserUDF.
+ */
+public class TestMediaFileUrlParserUDF extends TestCase {
+    private static final ObjectInspector STRING_OI =
+            PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+    private static final ObjectInspector LONG_OI =
+            PrimitiveObjectInspectorFactory.javaLongObjectInspector;
+
+    /**
+     * Runs a freshly initialized UDF on the given url and returns whatever
+     * its evaluate method returned.
+     */
+    private Object callUDF(String url) throws HiveException, IOException {
+        MediaFileUrlParserUDF udf = new MediaFileUrlParserUDF();
+        try {
+            udf.initialize(new ObjectInspector[] {STRING_OI});
+            DeferredObject[] arguments =
+                    new DeferredObject[] {new DeferredJavaObject(url)};
+            return udf.evaluate(arguments);
+        } finally {
+            udf.close();
+        }
+    }
+
+    /**
+     * Asserts that evaluating the url yields the given field values.
+     *
+     * The result array is expected to hold base name, the four
+     * classification flags (original, audio, image, movie), width, and
+     * height, in this order.
+     */
+    private void assertOutput(String url, String baseName,
+            Classification classification, Integer width, Integer height)
+            throws HiveException, IOException {
+        Object[] res = (Object[]) callUDF(url);
+
+        assertEquals("Result array has wrong length", 7, res.length);
+
+        assertEquals("base_name does not match", baseName, res[0]);
+
+        assertEquals("is_original does not match",
+                classification == Classification.ORIGINAL, res[1]);
+        assertEquals("is_transcoded_to_audio does not match",
+                classification == Classification.TRANSCODED_TO_AUDIO, res[2]);
+        assertEquals("is_transcoded_to_image does not match",
+                classification == Classification.TRANSCODED_TO_IMAGE, res[3]);
+        assertEquals("is_transcoded_to_movie does not match",
+                classification == Classification.TRANSCODED_TO_MOVIE, res[4]);
+
+        // assertEquals treats two nulls as equal, so expected null values
+        // need no separate handling.
+        assertEquals("width does not match", width, res[5]);
+        assertEquals("height does not match", height, res[6]);
+    }
+
+    /**
+     * Asserts that initializing the UDF with the given argument inspectors
+     * throws a HiveException.
+     */
+    private void assertInitializeFails(ObjectInspector[] arguments)
+            throws IOException {
+        MediaFileUrlParserUDF udf = new MediaFileUrlParserUDF();
+        try {
+            udf.initialize(arguments);
+            fail("Initialize did not throw HiveException");
+        } catch (HiveException expected) {
+            // Rejecting the arguments is what the calling test asks for.
+        } finally {
+            udf.close();
+        }
+    }
+
+    public void testInitialize() throws HiveException, IOException {
+        MediaFileUrlParserUDF udf = new MediaFileUrlParserUDF();
+        try {
+            udf.initialize(new ObjectInspector[] {STRING_OI});
+        } finally {
+            udf.close();
+        }
+    }
+
+    public void testInitializeEmpty() throws HiveException, IOException {
+        assertInitializeFails(new ObjectInspector[] {});
+    }
+
+    public void testInitializeWrongType() throws HiveException, IOException {
+        assertInitializeFails(new ObjectInspector[] {LONG_OI});
+    }
+
+    public void testEvaluateUnknown() throws HiveException, IOException {
+        assertOutput("foo", null, Classification.UNKNOWN, null, null);
+    }
+
+    public void testEvaluateOriginal() throws HiveException, IOException {
+        assertOutput("/math/d/a/9/da9d325123d50dbc4e36363f2863ce3e.png",
+                "/math/d/a/9/da9d325123d50dbc4e36363f2863ce3e.png",
+                Classification.ORIGINAL, null, null);
+    }
+
+    public void testEvaluateAudio() throws HiveException, IOException {
+        assertOutput(
+                "/wikipedia/commons/transcoded/b/bd/Xylophone_jingle.wav/Xylophone_jingle.wav.ogg",
+                "/wikipedia/commons/b/bd/Xylophone_jingle.wav",
+                Classification.TRANSCODED_TO_AUDIO, null, null);
+    }
+
+    public void testEvaluateImageWithWidth() throws HiveException, IOException {
+        assertOutput(
+                "/wikipedia/commons/thumb/a/ae/Flag_of_the_United_Kingdom.svg/1024px-Flag_of_the_United_Kingdom.svg.png",
+                "/wikipedia/commons/a/ae/Flag_of_the_United_Kingdom.svg",
+                Classification.TRANSCODED_TO_IMAGE, 1024, null);
+    }
+
+    public void testEvaluateImageWithoutWidth() throws HiveException, IOException {
+        assertOutput(
+                "/wikipedia/commons/thumb/a/ae/Flag_of_the_United_Kingdom.svg/mid-Flag_of_the_United_Kingdom.svg.png",
+                "/wikipedia/commons/a/ae/Flag_of_the_United_Kingdom.svg",
+                Classification.TRANSCODED_TO_IMAGE, null, null);
+    }
+
+    public void testEvaluateMovie() throws HiveException, IOException {
+        assertOutput(
+                "/wikipedia/commons/transcoded/3/31/Lheure_du_foo.ogv/Lheure_du_foo.ogv.360p.webm",
+                "/wikipedia/commons/3/31/Lheure_du_foo.ogv",
+                Classification.TRANSCODED_TO_MOVIE, null, 360);
+    }
+
+    public void testEncodedInput() throws HiveException, IOException {
+        assertOutput(
+                "/wikipedia/commons/a/ae/F%6F%6f.svg",
+                "/wikipedia/commons/a/ae/Foo.svg",
+                Classification.ORIGINAL, null, null);
+    }
+
+    public void testEncodedOutput() throws HiveException, IOException {
+        assertOutput(
+                "/wikipedia/commons/a/ae/F\no—o.svg",
+                "/wikipedia/commons/a/ae/F%0Ao%E2%80%94o.svg",
+                Classification.ORIGINAL, null, null);
+    }
+}
\ No newline at end of file
-- 
To view, visit https://gerrit.wikimedia.org/r/189981
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I1b76e1e331ea781aee13557fc55a2c19ce5744a7
Gerrit-PatchSet: 1
Gerrit-Project:
analytics/refinery/source Gerrit-Branch: master Gerrit-Owner: QChris <christ...@quelltextlich.at> Gerrit-Reviewer: Ottomata <o...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits