Hello Ottomata,

I'd like you to do a code review.  Please visit

    https://gerrit.wikimedia.org/r/189981

to review the following change.

Change subject: Add parser for media file urls
......................................................................

Add parser for media file urls

Change-Id: I1b76e1e331ea781aee13557fc55a2c19ce5744a7
---
M changelog.md
A 
refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/MediaFileUrlInfo.java
A 
refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/MediaFileUrlParser.java
A 
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestMediaFileUrlInfo.java
A 
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestMediaFileUrlParser.java
M 
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsLegacyPageviewUDF.java
A 
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/MediaFileUrlParserUDF.java
A 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestMediaFileUrlParserUDF.java
8 files changed, 1,655 insertions(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery/source 
refs/changes/81/189981/1

diff --git a/changelog.md b/changelog.md
index 76ac26a..5733c29 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,6 +1,7 @@
 ## v0.0.6-SNAPSHOT
 * Add custom percent en-/decoders to ease URL normalization.
 * Add Referer classifier
+* Add parser for media file urls
 
 ## v0.0.5
 * For geocoding, allow to specify the MaxMind databases that should get used.
diff --git 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/MediaFileUrlInfo.java
 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/MediaFileUrlInfo.java
new file mode 100644
index 0000000..ab972f9
--- /dev/null
+++ 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/MediaFileUrlInfo.java
@@ -0,0 +1,132 @@
+package org.wikimedia.analytics.refinery.core;
+
+public class MediaFileUrlInfo {
+
+    public enum Classification {
+        UNKNOWN,
+        ORIGINAL,
+        TRANSCODED_TO_AUDIO,
+        TRANSCODED_TO_IMAGE,
+        TRANSCODED_TO_MOVIE,
+    }
+
+    private String baseName;
+    private Classification classification;
+    private Integer width;
+    private Integer height;
+
+    public static MediaFileUrlInfo createUnknown() {
+        return new MediaFileUrlInfo(null, Classification.UNKNOWN,
+                null, null);
+    }
+
+    public static MediaFileUrlInfo createOriginal(final String baseName) {
+        return new MediaFileUrlInfo(baseName, Classification.ORIGINAL,
+                null, null);
+    }
+
+    public static MediaFileUrlInfo createTranscodedToImage(
+            final String baseName, final Integer width) {
+        return new MediaFileUrlInfo(baseName,
+                Classification.TRANSCODED_TO_IMAGE, width, null);
+    }
+
+    public static MediaFileUrlInfo createTranscodedToMovie(
+            final String baseName, final int height) {
+        return new MediaFileUrlInfo(baseName,
+                Classification.TRANSCODED_TO_MOVIE, null, height);
+    }
+
+    public static MediaFileUrlInfo createTranscodedToAudio(
+            final String baseName) {
+        return new MediaFileUrlInfo(baseName,
+                Classification.TRANSCODED_TO_AUDIO, null, null);
+    }
+
+    private MediaFileUrlInfo(final String baseName,
+            final Classification quality, final Integer width,
+            final Integer height) {
+        this.baseName = baseName;
+        this.classification = quality;
+        this.width = width;
+        this.height = height;
+    }
+
+    public String getBaseName() {
+        return baseName;
+    }
+
+    public Classification getClassification() {
+        return classification;
+    }
+
+    public Integer getWidth() {
+        return width;
+    }
+
+    public Integer getHeight() {
+        return height;
+    }
+
+    @Override
+    public boolean equals(final Object obj) {
+        boolean ret = false;
+
+        if (obj instanceof MediaFileUrlInfo) {
+            MediaFileUrlInfo other =
+                    (MediaFileUrlInfo) obj;
+
+            ret = true;
+
+            ret &= classification == other.classification;
+
+            if (baseName == null) {
+                ret &= other.baseName == null;
+            } else {
+                ret &= baseName.equals(other.baseName);
+            }
+
+            if (width == null) {
+                ret &= other.width == null;
+            } else {
+                ret &= width.equals(other.width);
+            }
+
+            if (height == null) {
+                ret &= other.height == null;
+            } else {
+                ret &= height.equals(other.height);
+            }
+        }
+
+        return ret;
+    }
+
+    @Override
+    public String toString() {
+        String ret = "MediaFileUrlInfo[";
+        switch (classification) {
+        case UNKNOWN:
+            ret += "unknown";
+            break;
+        case ORIGINAL:
+            ret += baseName;
+            ret += ", original";
+            break;
+        case TRANSCODED_TO_AUDIO:
+            ret += baseName;
+            ret += ", transcoded to audio";
+            break;
+        case TRANSCODED_TO_IMAGE:
+            ret += baseName;
+            ret += ", transcoded to image, width: " + width;
+            break;
+        case TRANSCODED_TO_MOVIE:
+            ret += baseName;
+            ret += ", transcoded to movie, height: " + height;
+            break;
+        }
+        ret += "]";
+        return ret;
+    }
+}
diff --git 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/MediaFileUrlParser.java
 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/MediaFileUrlParser.java
new file mode 100644
index 0000000..0574f96
--- /dev/null
+++ 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/MediaFileUrlParser.java
@@ -0,0 +1,262 @@
+// Copyright 2014 Wikimedia Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.wikimedia.analytics.refinery.core;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.lang3.StringUtils;
+
+/**
+ * Parses a URL into a MediaFileUrlInfo
+ */
+public class MediaFileUrlParser {
+    /**
+     * Pattern to match wikis within other patterns
+     */
+    private static Pattern wikiPattern = Pattern.compile("[a-z_-]{2,}[0-9]*");
+
+    /**
+     * Pattern to match math urls
+     */
+    private static Pattern mathPattern = Pattern.compile(
+            "/math"
+            + "/([0-9-a-f])"
+            + "/([0-9-a-f])"
+            + "/([0-9-a-f])"
+            + "/\\1\\2\\3[0-9-a-f]{29}\\.png");
+
+    private static Pattern mathPerWikiPattern = Pattern.compile(
+            "/[^/]*/" + wikiPattern.pattern() + mathPattern.pattern());
+
+    /**
+     * Pattern to match score urls
+     */
+    private static Pattern scorePattern = Pattern.compile(
+            "(/score"
+            + "/([0-9a-z])"
+            + "/([0-9a-z])"
+            + "/(\\2\\3[0-9a-z]{6})[0-9a-z]{23}/\\4\\.)((png)|(ogg|midi))");
+
+    /**
+     * Pattern to match timeline urls
+     */
+    private static Pattern timelinePattern = Pattern.compile(
+            "/[^/]*/" + wikiPattern.pattern() + 
"/timeline/[0-9-a-f]{32}\\.png");
+
+    /**
+     * Pattern to match urls for plain uploaded media files
+     */
+    private static Pattern uploadedPattern = Pattern.compile(
+            "(/[^/]*/" + wikiPattern.pattern() + ")"  // group 1: project
+            + "(?:/(thumb|transcoded))?"        // group 2: Markers for 
transcodings
+            + "(/archive|/temp)?"            // group 3: Needed to construct 
basename
+            + "(/([0-9-a-f])/\\5[0-9-a-f])"  // groups 4+5: Hash. Needed for 
backref, and to construct basename
+            + 
"/(?:([12][0-9]{3}[01][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-6][0-9])(?:!|%21))?"
 // group 6: timestamp
+            + "(([^/]*?)(?:\\.[^./]*)?)"     // group 7: the main file name
+                                             // group 8: the main file name 
without suffix (such as ".png")
+            + "(/"                           // group 9: the whole transcoding 
spec
+                + "(?:lossy-)?"              // If transcoding is marked lossy 
(Like a single page of a tiff -> jpeg)
+                + "(?:lossless-)?"           // If transcoding is marked 
lossless (Like a single page of a tiff -> png)
+                + "(?:page[0-9]+-)?"         // For single page transcodings 
of a multi-page original (Like tiff -> png, pdf-> png)
+                + "(?:lang[a-z-]*-)?"        // Rendering only a single 
language of a multi-language original (Like svg -> png)
+                + "(?:(?:qlow|mid)-)?"       // Quality markers with undefined 
width/height
+                + "(?:0*([1-9]+[0-9]*)px-)?" // group 10: Thumbnail pixel 
width (like 120px)
+                + "(?:seek(?:=|%3D)[0-9]+(?:\\.[0-9]*)?)?" // Seeking to a 
timestamp in a video (When transcoding movies to images)
+                + "-?"                       // This is the "-" that separates 
the prepended options from the thumbnail name.
+                + "(?:\\7|\\8|thumbnail(?:\\.(?:djvu|ogv|pdf|svg|tiff?))?)" // 
main thumbnail name
+                + "(?:\\.0*([1-9][0-9]*)p)?" // group 11: Transcoding height
+                + "(?:\\.(?:"                // Ending of transcoded output 
format for:
+                    + "(ogg)"                //   group 12: audio files
+                    + "|(gif|jpe?g|png)"     //   group 13: images
+                    + "|(webm|ogv)"          //   group 14: movies
+                + "))"
+            + ")?");
+
+    /**
+     * Parses a string of digits to a bounded Integer, possibly null
+     * <p/>
+     * If the string of digits is too large to fit in an Integer, the maximum 
possible
+     * Integer is returned.
+     *
+     * @param digits The string of digits to parse into an Integer
+     * @return null, if digits is null. Otherwise an Integer in
+     *   [0, Integer.MAX_VALUE]
+     */
+    private static Integer parseDigitString(String digits) {
+        Integer ret = null;
+        if (digits != null) {
+            try {
+                ret = Integer.parseInt(digits);
+            } catch (NumberFormatException e) {
+                // Since digits is required to be a string of digits, the only 
way a NumberFormatException can be thrown is that the number is too big. Hence, 
we bound it to the maximum possible integer.
+                ret = Integer.MAX_VALUE;
+            }
+        }
+        return ret;
+    }
+
+    /**
+     * Parses information out of a url for media files in the upload domain
+     *
+     * @param url The url to parse
+     * @return MediaFileUrlInfo holding the parsed data.
+     *   null if parsing failed.
+     */
+    public static MediaFileUrlInfo parse(String url) {
+        final MediaFileUrlInfo ret;
+
+        if (url == null) {
+            return null;
+        }
+
+        String uriPath;
+
+        if (url.startsWith("http://upload.wikimedia.org/")) {
+            uriPath = url.substring(27);
+        } else if (url.startsWith("https://upload.wikimedia.org/")) {
+            uriPath = url.substring(28);
+        } else if (url.startsWith("/")) {
+            uriPath = url;
+        } else {
+            return null;
+        }
+
+        // url was either protocol- and domain-less, or it is valid for upload.
+        assert uriPath.startsWith("/") : "uriPath does not start in \"/\", but 
is " + uriPath;
+
+        uriPath = PercentDecoder.decode(uriPath);
+        uriPath = uriPath.replaceAll("//+", "/");
+        uriPath = uriPath.trim();
+
+        String[] uriPathParts = StringUtils.split(uriPath, '/');
+
+        assert uriPathParts != null : "Split gave null array";
+
+        if (uriPathParts.length < 1) {
+            return null;
+        }
+
+        switch (uriPathParts[0]) {
+        case "math":
+            if (mathPattern.matcher(uriPath).matches()){
+                ret = MediaFileUrlInfo.createOriginal(uriPath);
+            } else {
+                return null;
+            };
+            break;
+        case "score":
+            Matcher matcher = scorePattern.matcher(uriPath);
+            if (matcher.matches()) {
+                String baseName = matcher.group(1) + "png";
+                if (matcher.group(6) != null) {
+                    ret = MediaFileUrlInfo.createTranscodedToImage(
+                            baseName, null);
+                } else if (matcher.group(7) != null) {
+                    ret = MediaFileUrlInfo.createTranscodedToAudio(baseName);
+                } else {
+                    throw new AssertionError("Logic error due to score having "
+                            + "both group 6 and 7 empty  ('" + uriPath + "')");
+                }
+            } else {
+                return null;
+            };
+            break;
+        case "wikibooks":
+        case "wikinews":
+        case "wikimedia":
+        case "wikipedia":
+        case "wikiquote":
+        case "wikisource":
+        case "wikiversity":
+        case "wikivoyage":
+        case "wiktionary":
+            Matcher imageMatcher = uploadedPattern.matcher(uriPath);
+            if (imageMatcher.matches()) {
+                String project = imageMatcher.group(1);
+                String transcoding = imageMatcher.group(2);
+                String  timestampFlag = imageMatcher.group(3);
+                String hash = imageMatcher.group(4);
+                // No group 5, as that group holds the first hexadecimal digit.
+                String timestamp = imageMatcher.group(6);
+                String file = imageMatcher.group(7);
+                // No group 8, as that group holds the file without suffix.
+                String transcodingSpec = imageMatcher.group(9);
+                String widthStr = imageMatcher.group(10);
+                String heightStr = imageMatcher.group(11);
+                String transcodedAudioSuffix = imageMatcher.group(12);
+                String transcodedImageSuffix = imageMatcher.group(13);
+                String transcodedMovieSuffix = imageMatcher.group(14);
+
+                // Setting basename
+                final String baseName;
+                if (timestampFlag == null) {
+                    baseName = project + hash + '/' + file;
+                } else if ("/archive".equals(timestampFlag)) {
+                    baseName = project + timestampFlag + hash + '/'
+                            + timestamp + '!' + file;
+                } else if ("/temp".equals(timestampFlag)) {
+                    // Note that the timestamp is matched within the file, so
+                    // no need to add the timestamp here.
+                    baseName = project + timestampFlag + hash + '/' + file;
+                } else {
+                    throw new AssertionError("Logic error due to timestampFlag"
+                            + " '" + timestampFlag + "' not being handled");
+                }
+
+                if (transcoding == null && transcodingSpec == null) {
+                    ret = MediaFileUrlInfo.createOriginal(baseName);
+                } else if (transcoding != null && transcodingSpec != null) {
+                    if ("thumb".equals(transcoding)
+                            || (transcodedImageSuffix != null)) {
+                        Integer width = parseDigitString(widthStr);
+                        ret = 
MediaFileUrlInfo.createTranscodedToImage(baseName, width);
+                    } else if (transcodedAudioSuffix != null) {
+                        ret = 
MediaFileUrlInfo.createTranscodedToAudio(baseName);
+                    } else if (transcodedMovieSuffix != null) {
+                        Integer height = parseDigitString(heightStr);
+                        ret = 
MediaFileUrlInfo.createTranscodedToMovie(baseName, height);
+                    } else {
+                        throw new AssertionError("Logic error due to"
+                                + "transcodingSpec without a suffix specific 
handler '"
+                                + transcodingSpec + "'");
+                    }
+                } else {
+                    return null;
+                }
+            } else if (timelinePattern.matcher(uriPath).matches()) {
+                ret = MediaFileUrlInfo.createOriginal(uriPath);
+            } else if (mathPerWikiPattern.matcher(uriPath).matches()) {
+                ret = MediaFileUrlInfo.createOriginal(uriPath);
+            } else {
+                return null;
+            };
+            break;
+        case "favicon.ico":
+            if (("/" + uriPathParts[0]).equals(uriPath)) {
+                ret = MediaFileUrlInfo.createOriginal(uriPath);
+            } else {
+                return null;
+            }
+            break;
+        default:
+            return null;
+        }
+
+        assert (ret != null) : "Logic error, as info is still not set";
+
+        return ret;
+    }
+}
diff --git 
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestMediaFileUrlInfo.java
 
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestMediaFileUrlInfo.java
new file mode 100644
index 0000000..fe7833a
--- /dev/null
+++ 
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestMediaFileUrlInfo.java
@@ -0,0 +1,323 @@
+package org.wikimedia.analytics.refinery.core;
+
+import org.wikimedia.analytics.refinery.core.MediaFileUrlInfo.Classification;
+
+import junit.framework.TestCase;
+
+public class TestMediaFileUrlInfo extends TestCase {
+
+    public void assertEquals(String message, int expected, Integer actual) {
+        assertEquals(message, new Integer(expected), actual);
+    }
+
+    public void assertContainsIgnoreCase(String hayStack, String needle) {
+        assertTrue("'" + hayStack + "' does not contain '" + needle + "'",
+                hayStack.toLowerCase().contains(needle.toLowerCase()));
+    }
+
+    // Factory methods --------------------------------------------------------
+
+    public void testCreateUnknown() {
+        MediaFileUrlInfo info = MediaFileUrlInfo.createUnknown();
+
+        assertNull("Base name not null", info.getBaseName());
+        assertEquals("Classification does not match",
+                Classification.UNKNOWN, info.getClassification());
+        assertNull("Width not null", info.getWidth());
+        assertNull("Height not null", info.getHeight());
+    }
+
+    public void testCreateOriginal() {
+        MediaFileUrlInfo info = MediaFileUrlInfo.createOriginal("foo");
+
+        assertEquals("Base name does not match", "foo", info.getBaseName());
+        assertEquals("Classification does not match",
+                Classification.ORIGINAL, info.getClassification());
+        assertNull("Width not null for unknown", info.getWidth());
+        assertNull("Height not null for unknown", info.getHeight());
+    }
+
+    public void testCreateTranscodedToAudio() {
+        MediaFileUrlInfo info = 
MediaFileUrlInfo.createTranscodedToAudio("foo");
+
+        assertEquals("Base name does not match", "foo", info.getBaseName());
+        assertEquals("Classification does not match",
+                Classification.TRANSCODED_TO_AUDIO, info.getClassification());
+        assertNull("Width not null for unknown", info.getWidth());
+        assertNull("Height not null for unknown", info.getHeight());
+    }
+
+    public void testCreateTranscodedToImage() {
+        MediaFileUrlInfo info = 
MediaFileUrlInfo.createTranscodedToImage("foo", 42);
+
+        assertEquals("Base name does not match", "foo", info.getBaseName());
+        assertEquals("Classification does not match",
+                Classification.TRANSCODED_TO_IMAGE, info.getClassification());
+        assertEquals("Width does not match", 42, info.getWidth());
+        assertNull("Height not null", info.getHeight());
+    }
+
+    public void testCreateTranscodedToMovie() {
+        MediaFileUrlInfo info = 
MediaFileUrlInfo.createTranscodedToMovie("foo", 42);
+
+        assertEquals("Base name does not match", "foo", info.getBaseName());
+        assertEquals("Classification does not match",
+                Classification.TRANSCODED_TO_MOVIE, info.getClassification());
+        assertNull("Width not null", info.getWidth());
+        assertEquals("Height does not match", 42, info.getHeight());
+    }
+
+    // Instance methods -------------------------------------------------------
+
+    // Equals .................................................................
+
+    public void testUnknowEqualsNull() {
+        MediaFileUrlInfo info = MediaFileUrlInfo.createUnknown();
+
+        boolean actual = info.equals(null);
+
+        assertFalse("Proper instance equals null", actual);
+    }
+
+    public void testUnknownEqualsSame() {
+        MediaFileUrlInfo info = MediaFileUrlInfo.createUnknown();
+
+        boolean actual = info.equals(info);
+
+        assertTrue("Proper instance does not equal itself", actual);
+    }
+
+    public void testUnknownEqualsUnknown() {
+        MediaFileUrlInfo infoA = MediaFileUrlInfo.createUnknown();
+        MediaFileUrlInfo infoB = MediaFileUrlInfo.createUnknown();
+
+        boolean actual = infoA.equals(infoB);
+
+        assertTrue("Two unknown not equal", actual);
+    }
+
+    public void testOriginalEqualsUnknown() {
+        MediaFileUrlInfo infoA = MediaFileUrlInfo.createOriginal("foo");
+        MediaFileUrlInfo infoB = MediaFileUrlInfo.createUnknown();
+
+        boolean actual = infoA.equals(infoB);
+
+        assertFalse("Original and unknown equal", actual);
+    }
+
+    public void testOriginalEqualsOriginal() {
+        MediaFileUrlInfo infoA = MediaFileUrlInfo.createOriginal("foo");
+
+        MediaFileUrlInfo infoB = MediaFileUrlInfo.createOriginal("foo");
+
+        boolean actual = infoA.equals(infoB);
+
+        assertTrue("Originals not equal", actual);
+    }
+
+    public void testOriginalEqualsOriginalDifferentBaseName() {
+        MediaFileUrlInfo infoA = MediaFileUrlInfo.createOriginal("foo");
+
+        MediaFileUrlInfo infoB = MediaFileUrlInfo.createOriginal("bar");
+
+        boolean actual = infoA.equals(infoB);
+
+        assertFalse("Base name does not disambiguate", actual);
+    }
+
+    public void testOriginalEqualsOriginalBaseNameOtherNull() {
+        MediaFileUrlInfo infoA = MediaFileUrlInfo.createOriginal("foo");
+
+        MediaFileUrlInfo infoB = MediaFileUrlInfo.createOriginal(null);
+
+        boolean actual = infoA.equals(infoB);
+
+        assertFalse("BaseName does not disambiguate", actual);
+    }
+
+    public void testOriginalEqualsOriginalBaseNameThisNull() {
+        MediaFileUrlInfo infoA = MediaFileUrlInfo.createOriginal(null);
+
+        MediaFileUrlInfo infoB = MediaFileUrlInfo.createOriginal("foo");
+
+        boolean actual = infoA.equals(infoB);
+
+        assertFalse("BaseName does not disambiguate", actual);
+    }
+
+    public void testAudioEqualsAudio() {
+        MediaFileUrlInfo infoA =
+                MediaFileUrlInfo.createTranscodedToAudio("foo");
+        MediaFileUrlInfo infoB =
+                MediaFileUrlInfo.createTranscodedToAudio("foo");
+
+        boolean actual = infoA.equals(infoB);
+
+        assertTrue("Audio files to not equal", actual);
+    }
+
+    public void testAudioEqualsAudioDifferentBaseName() {
+        MediaFileUrlInfo infoA =
+                MediaFileUrlInfo.createTranscodedToAudio("foo");
+        MediaFileUrlInfo infoB =
+                MediaFileUrlInfo.createTranscodedToAudio("bar");
+
+        boolean actual = infoA.equals(infoB);
+
+        assertFalse("Base name does not disambiguate", actual);
+    }
+
+    public void testAudioEqualsOriginal() {
+        MediaFileUrlInfo infoA =
+                MediaFileUrlInfo.createTranscodedToAudio("foo");
+        MediaFileUrlInfo infoB = MediaFileUrlInfo.createOriginal("foo");
+
+        boolean actual = infoA.equals(infoB);
+
+        assertFalse("Audio and original equal", actual);
+    }
+
+    public void testImageEqualsImage() {
+        MediaFileUrlInfo infoA =
+                MediaFileUrlInfo.createTranscodedToImage("foo", 42);
+        MediaFileUrlInfo infoB =
+                MediaFileUrlInfo.createTranscodedToImage("foo", 42);
+
+        boolean actual = infoA.equals(infoB);
+
+        assertTrue("Images not equal", actual);
+    }
+
+    public void testImageEqualsAudio() {
+        MediaFileUrlInfo infoA =
+                MediaFileUrlInfo.createTranscodedToImage("foo", 42);
+        MediaFileUrlInfo infoB =
+                MediaFileUrlInfo.createTranscodedToAudio("foo");
+
+        boolean actual = infoA.equals(infoB);
+
+        assertFalse("Image and audio equal", actual);
+    }
+
+    public void testImageEqualsImageDifferentBaseName() {
+        MediaFileUrlInfo infoA =
+                MediaFileUrlInfo.createTranscodedToImage("foo", 42);
+        MediaFileUrlInfo infoB =
+                MediaFileUrlInfo.createTranscodedToImage("bar", 42);
+
+        boolean actual = infoA.equals(infoB);
+
+        assertFalse("Base name does not disambiguate", actual);
+    }
+
+    public void testImageEqualsImageDifferentWidth() {
+        MediaFileUrlInfo infoA =
+                MediaFileUrlInfo.createTranscodedToImage("foo", 42);
+        MediaFileUrlInfo infoB =
+                MediaFileUrlInfo.createTranscodedToImage("foo", 43);
+
+        boolean actual = infoA.equals(infoB);
+
+        assertFalse("Width does not disambiguate", actual);
+    }
+
+    public void testMovieEqualsImage() {
+        MediaFileUrlInfo infoA =
+                MediaFileUrlInfo.createTranscodedToMovie("foo", 42);
+        MediaFileUrlInfo infoB =
+                MediaFileUrlInfo.createTranscodedToImage("foo", 42);
+
+        boolean actual = infoA.equals(infoB);
+
+        assertFalse("Movie and image equal", actual);
+    }
+
+    public void testMovieEqualsMovie() {
+        MediaFileUrlInfo infoA =
+                MediaFileUrlInfo.createTranscodedToMovie("foo", 42);
+        MediaFileUrlInfo infoB =
+                MediaFileUrlInfo.createTranscodedToMovie("foo", 42);
+
+        boolean actual = infoA.equals(infoB);
+
+        assertTrue("Movies not equal", actual);
+    }
+
+    public void testMovieEqualsMovieDifferentBaseName() {
+        MediaFileUrlInfo infoA =
+                MediaFileUrlInfo.createTranscodedToMovie("foo", 42);
+        MediaFileUrlInfo infoB =
+                MediaFileUrlInfo.createTranscodedToMovie("bar", 42);
+
+        boolean actual = infoA.equals(infoB);
+
+        assertFalse("Base name does not disambiguate", actual);
+    }
+
+    public void testMovieEqualsMovieDifferentHeight() {
+        MediaFileUrlInfo infoA =
+                MediaFileUrlInfo.createTranscodedToMovie("foo", 42);
+        MediaFileUrlInfo infoB =
+                MediaFileUrlInfo.createTranscodedToMovie("foo", 43);
+
+        boolean actual = infoA.equals(infoB);
+
+        assertFalse("Height does not disambiguate", actual);
+    }
+
+    public void testUnknownEqualsMovie() {
+        MediaFileUrlInfo infoA = MediaFileUrlInfo.createUnknown();
+        MediaFileUrlInfo infoB =
+                MediaFileUrlInfo.createTranscodedToMovie("foo", 42);
+
+        boolean actual = infoA.equals(infoB);
+
+        assertFalse("Unknown and movie equal", actual);
+    }
+
+    // toString ...............................................................
+
+    public void testUnknown() {
+        String stringRep = MediaFileUrlInfo.createUnknown().toString();
+
+        assertContainsIgnoreCase(stringRep, "unknown");
+    }
+
+    public void testOriginal() {
+        String stringRep = MediaFileUrlInfo.createOriginal("foo").toString();
+
+        assertContainsIgnoreCase(stringRep, "original");
+        assertContainsIgnoreCase(stringRep, "foo");
+    }
+
+    public void testAudio() {
+        String stringRep = 
MediaFileUrlInfo.createTranscodedToAudio("foo").toString();
+
+        assertContainsIgnoreCase(stringRep, "audio");
+        assertContainsIgnoreCase(stringRep, "foo");
+    }
+
+    public void testImageWidthNonNull() {
+        String stringRep = MediaFileUrlInfo.createTranscodedToImage("foo", 
42).toString();
+
+        assertContainsIgnoreCase(stringRep, "image");
+        assertContainsIgnoreCase(stringRep, "foo");
+        assertContainsIgnoreCase(stringRep, "42");
+    }
+
+    public void testImageWidthNull() {
+        String stringRep = MediaFileUrlInfo.createTranscodedToImage("foo", 
null).toString();
+
+        assertContainsIgnoreCase(stringRep, "image");
+        assertContainsIgnoreCase(stringRep, "foo");
+        assertContainsIgnoreCase(stringRep, "null");
+    }
+
+    public void testMovie() {
+        String stringRep = MediaFileUrlInfo.createTranscodedToMovie("foo", 
42).toString();
+
+        assertContainsIgnoreCase(stringRep, "movie");
+        assertContainsIgnoreCase(stringRep, "foo");
+        assertContainsIgnoreCase(stringRep, "42");
+    }
+}
diff --git 
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestMediaFileUrlParser.java
 
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestMediaFileUrlParser.java
new file mode 100644
index 0000000..dd9e216
--- /dev/null
+++ 
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestMediaFileUrlParser.java
@@ -0,0 +1,615 @@
+// Copyright 2014 Wikimedia Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.wikimedia.analytics.refinery.core;
+
+import junit.framework.TestCase;
+
+public class TestMediaFileUrlParser extends TestCase {
+
+    // Helper methods ---------------------------------------------------------
+
+    /**
+     * Parses a url and asserts that the parser's result equals the
+     * expectation.
+     *
+     * @param url the url to hand to MediaFileUrlParser.parse
+     * @param expected the info the parser has to produce for the url
+     */
+    private void assertParsed(final String url,
+            final MediaFileUrlInfo expected) {
+        assertEquals("Parsed info does not equal expected", expected,
+                MediaFileUrlParser.parse(url));
+    }
+
+    /**
+     * Asserts that the parser yields null for the given url.
+     *
+     * @param url the url that must not get identified
+     */
+    private void assertUnidentified(final String url) {
+        final MediaFileUrlInfo noInfo = null;
+        assertParsed(url, noInfo);
+    }
+
+    /**
+     * Asserts that a url gets parsed to an original with the given base
+     * name.
+     *
+     * @param url the url to parse
+     * @param baseName the base name expected in the parsed info
+     */
+    private void assertOriginal(final String url,
+            final String baseName) {
+        final MediaFileUrlInfo expected =
+                MediaFileUrlInfo.createOriginal(baseName);
+        assertParsed(url, expected);
+    }
+
+    /**
+     * Asserts that a url gets parsed to an original whose base name is the
+     * url itself.
+     *
+     * @param url the url to parse; also used as expected base name
+     */
+    private void assertOriginal(final String url) {
+        assertParsed(url, MediaFileUrlInfo.createOriginal(url));
+    }
+
+    /**
+     * Asserts that a url gets parsed to an image transcoding.
+     *
+     * @param url the url to parse
+     * @param baseName the base name expected in the parsed info
+     * @param width the width expected in the parsed info; tests pass null
+     *     for urls that carry no width
+     */
+    private void assertImage(final String url,
+            final String baseName, final Integer width) {
+        final MediaFileUrlInfo expected =
+                MediaFileUrlInfo.createTranscodedToImage(baseName, width);
+        assertParsed(url, expected);
+    }
+
+    /**
+     * Asserts that a url gets parsed to a movie transcoding.
+     *
+     * @param url the url to parse
+     * @param baseName the base name expected in the parsed info
+     * @param height the height expected in the parsed info
+     */
+    private void assertMovie(final String url,
+            final String baseName, final int height) {
+        final MediaFileUrlInfo expected =
+                MediaFileUrlInfo.createTranscodedToMovie(baseName, height);
+        assertParsed(url, expected);
+    }
+
+    /**
+     * Asserts that a url gets parsed to an audio transcoding.
+     *
+     * @param url the url to parse
+     * @param baseName the base name expected in the parsed info
+     */
+    private void assertAudio(final String url, final String baseName) {
+        final MediaFileUrlInfo expected =
+                MediaFileUrlInfo.createTranscodedToAudio(baseName);
+        assertParsed(url, expected);
+    }
+
+    // Test degenerate settings -----------------------------------------------
+
+    public void testNull() {
+        assertUnidentified(null);
+    }
+
+    public void testEmpty() {
+        assertUnidentified("");
+    }
+
+    public void testPlainSlash() {
+        assertUnidentified("/");
+    }
+
+    public void testLongPixelStringLowResolution() {
+        assertImage(
+                
"/wikipedia/commons/thumb/8/83/Kit_body.svg/00000000000000000000000000000000000000000000000000000000000000000000000000000000000000001px-Kit_body.svg.png",
+                "/wikipedia/commons/8/83/Kit_body.svg", 1);
+    }
+
+    public void testLongPixelStringHighResolution() {
+        assertImage(
+                
"/wikipedia/commons/thumb/8/83/Kit_body.svg/10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000px-Kit_body.svg.png",
+                "/wikipedia/commons/8/83/Kit_body.svg",
+                Integer.MAX_VALUE);
+    }
+
+    // Test protocols ---------------------------------------------------------
+
+    public void testNoProtocolNoLeadingSlash() {
+        assertUnidentified("math/d/a/9/da9d325123d50dbc4e36363f2863ce3e.png");
+    }
+
+    public void testNoProtocolLeadingSlash() {
+        assertOriginal("/math/d/a/9/da9d325123d50dbc4e36363f2863ce3e.png");
+    }
+
+    /**
+     * A full http url gets identified as original; the expected base name
+     * carries neither protocol nor host.
+     */
+    public void testHttp() {
+        assertOriginal(
+                "http://upload.wikimedia.org/math/d/a/9/da9d325123d50dbc4e36363f2863ce3e.png",
+                "/math/d/a/9/da9d325123d50dbc4e36363f2863ce3e.png");
+    }
+
+    /**
+     * An http url that consists only of protocol, host, and a plain slash
+     * must not get identified.
+     */
+    public void testHttpPlainSlash() {
+        assertUnidentified("http://upload.wikimedia.org/");
+    }
+
+    /**
+     * A full https url gets identified as original; the expected base name
+     * carries neither protocol nor host.
+     */
+    public void testHttps() {
+        assertOriginal(
+                "https://upload.wikimedia.org/math/d/a/9/da9d325123d50dbc4e36363f2863ce3e.png",
+                "/math/d/a/9/da9d325123d50dbc4e36363f2863ce3e.png");
+    }
+
+    /**
+     * An https url that consists only of protocol, host, and a plain slash
+     * must not get identified.
+     */
+    public void testHttpsPlainSlash() {
+        assertUnidentified("https://upload.wikimedia.org/");
+    }
+
+    // Test uri cleanup -------------------------------------------------------
+
+    public void testSlashTrimming() {
+        assertOriginal(
+                "/math/d/a///9//da9d325123d50dbc4e36363f2863ce3e.png",
+                "/math/d/a/9/da9d325123d50dbc4e36363f2863ce3e.png");
+    }
+
+    public void testDecodingThumbNoneEncoded() {
+        assertImage(
+                
"/wikipedia/commons/thumb/7/7a/Japan_on_the_globe_(claimed)_(Japan_centered).svg/240px-Japan_on_the_globe_(claimed)_(Japan_centered).svg.png",
+                
"/wikipedia/commons/7/7a/Japan_on_the_globe_(claimed)_(Japan_centered).svg",
+                240);
+    }
+
+    public void testDecodingThumbBothEncoded() {
+        assertImage(
+                
"/wikipedia/commons/thumb/7/7a/Japan_on_the_globe_%28claimed%29_%28Japan_centered%29.svg/240px-Japan_on_the_globe_%28claimed%29_%28Japan_centered%29.svg.png",
+                
"/wikipedia/commons/7/7a/Japan_on_the_globe_(claimed)_(Japan_centered).svg",
+                240);
+    }
+
+    public void testDecodingThumbOnlyMainPartEncoded() {
+        assertImage(
+                
"/wikipedia/commons/thumb/7/7a/Japan_on_the_globe_%28claimed%29_%28Japan_centered%29.svg/240px-Japan_on_the_globe_(claimed)_(Japan_centered).svg.png",
+                
"/wikipedia/commons/7/7a/Japan_on_the_globe_(claimed)_(Japan_centered).svg",
+                240);
+    }
+
+    public void testDecodingThumbOnlyThumbEncoded() {
+        assertImage(
+                
"/wikipedia/commons/thumb/7/7a/Japan_on_the_globe_(claimed)_(Japan_centered).svg/240px-Japan_on_the_globe_%28claimed%29_%28Japan_centered%29.svg.png",
+                
"/wikipedia/commons/7/7a/Japan_on_the_globe_(claimed)_(Japan_centered).svg",
+                240);
+    }
+
+    public void testTrimming() {
+        assertImage(
+                
"/wikipedia/commons/thumb/a/ae/Essig-1.jpg/459px-Essig-1.jpg%20%20%20",
+                "/wikipedia/commons/a/ae/Essig-1.jpg", 459);
+    }
+
+    // Test static assets -----------------------------------------------------
+
+    public void testFavicon() {
+        assertOriginal("/favicon.ico");
+    }
+
+    public void testFaviconWithPathlessSuffix() {
+        assertUnidentified("/favicon.icofoo");
+    }
+
+    public void testFaviconWithPathedSuffix() {
+        assertUnidentified("/favicon.ico/foo");
+    }
+
+    // Test math images -------------------------------------------------------
+
+    public void testMathPlain() {
+        assertOriginal("/math/d/a/9/da9d325123d50dbc4e36363f2863ce3e.png");
+    }
+
+    public void testMathNonHexFirstUrlPart() {
+        assertUnidentified("/math/X/a/9/da9d325123d50dbc4e36363f2863ce3e.png");
+    }
+
+    public void testMathNonHexSecondUrlPart() {
+        assertUnidentified("/math/d/Y/9/da9d325123d50dbc4e36363f2863ce3e.png");
+    }
+
+    public void testMathNonHexThirdUrlPart() {
+        assertUnidentified("/math/d/a/Z/da9d325123d50dbc4e36363f2863ce3e.png");
+    }
+
+    public void testMathNonHexHash() {
+        assertUnidentified("/math/d/a/9/da9d32512Qd50dbc4e36363f2863ce3e.png");
+    }
+
+    public void testMathHashTooLong() {
+        
assertUnidentified("/math/d/a/9/da9d325123d50dbc4e36363f2863ce3e0.png");
+    }
+
+    public void testMathHashTooShort() {
+        assertUnidentified("/math/d/a/9/da9d325123d50dbc4e36363f2863ce3.png");
+    }
+
+    public void testMathFirstHashDigitMismatch() {
+        assertUnidentified("/math/0/a/9/da9d325123d50dbc4e36363f2863ce3e.png");
+    }
+
+    public void testMathSecondHashDigitMismatch() {
+        assertUnidentified("/math/d/0/9/da9d325123d50dbc4e36363f2863ce3e.png");
+    }
+
+    public void testMathThirdHashDigitMismatch() {
+        assertUnidentified("/math/d/a/0/da9d325123d50dbc4e36363f2863ce3e.png");
+    }
+
+    public void testMathPerWikiPlain() {
+        
assertOriginal("/wikipedia/en/math/5/6/a/56a5d0fae0136327e61476dcfe43109a.png");
+    }
+
+    // Test score -------------------------------------------------------------
+
+    public void testScore() {
+        assertImage(
+                "/score/7/a/7aem9jwwirkhn0ucbewj9gs7aofzc2b/7aem9jww.png",
+                "/score/7/a/7aem9jwwirkhn0ucbewj9gs7aofzc2b/7aem9jww.png", 
null);
+    }
+
+    public void testScoreNonAlphaNumFirstPart() {
+        
assertUnidentified("/score/-/a/7aem9jwwirkhn0ucbewj9gs7aofzc2b/7aem9jww.png");
+    }
+
+    public void testScoreNonAlphaNumSecondPart() {
+        
assertUnidentified("/score/7/-/7aem9jwwirkhn0ucbewj9gs7aofzc2b/7aem9jww.png");
+    }
+
+    public void testScoreNonAlphaNumThirdPart() {
+        
assertUnidentified("/score/7/a/7ae-9jwwirkhn0ucbewj9gs7aofzc2b/7aem9jww.png");
+    }
+
+    public void testScoreNonAlphaNumFourthPart() {
+        
assertUnidentified("/score/7/a/7aem9jwwirkhn0ucbewj9gs7aofzc2b/7aem9jw-.png");
+    }
+
+    public void testScoreNonMatchingFirstPart() {
+        
assertUnidentified("/score/8/a/7aem9jwwirkhn0ucbewj9gs7aofzc2b/7aem9jww.png");
+    }
+
+    public void testScoreNonMatchingSecondPart() {
+        
assertUnidentified("/score/7/b/7aem9jwwirkhn0ucbewj9gs7aofzc2b/7aem9jww.png");
+    }
+
+    public void testScoreNonMatchingThirdPart() {
+        
assertUnidentified("/score/7/a/7aeg9jwwirkhn0ucbewj9gs7aofzc2b/7aem9jww.png");
+    }
+
+    public void testScoreNonMatchingFourthPart() {
+        
assertUnidentified("/score/7/a/7aem9jwwirkhn0ucbewj9gs7aofzc2b/7aem0jww.png");
+    }
+
+    public void testScoreTooLongThirdPart() {
+        
assertUnidentified("/score/7/a/7aem9jwwirkhn0ucbewj9gs7aofzc2bc/7aem9jww.png");
+    }
+
+    public void testScoreTooLongFourthPart() {
+        
assertUnidentified("/score/7/a/7aem9jwwirkhn0ucbewj9gs7aofzc2b/7aem9jwwi.png");
+    }
+
+    /**
+     * The ogg rendering of a score is an audio transcoding; the expected
+     * base name is the score's png.
+     */
+    public void testScoreOgg() {
+        final String url =
+                "/score/q/0/q0bopydzemuz315z4n6dvg8sfu8qsu0/q0bopydz.ogg";
+        final String baseName =
+                "/score/q/0/q0bopydzemuz315z4n6dvg8sfu8qsu0/q0bopydz.png";
+
+        assertAudio(url, baseName);
+    }
+
+    /**
+     * The midi rendering of a score is an audio transcoding; the expected
+     * base name is the score's png.
+     */
+    public void testScoreMidi() {
+        final String url =
+                "/score/k/7/k7yj1lvc3fqecbmknn497haqj6x9g2y/k7yj1lvc.midi";
+        final String baseName =
+                "/score/k/7/k7yj1lvc3fqecbmknn497haqj6x9g2y/k7yj1lvc.png";
+
+        assertAudio(url, baseName);
+    }
+
+    // Test timeline image ----------------------------------------------------
+
+    public void testTimeline() {
+        
assertOriginal("/wikipedia/en/timeline/12435a102adebdee9059bc97bb652af1.png");
+    }
+
+    // Test uploaded media files ----------------------------------------------
+
+    public void testMediaMeta() {
+        assertOriginal("/wikipedia/meta/7/74/Wikibooks-logo_sister_1x.png");
+    }
+
+    public void testMediaNonHexFirstPart() {
+        
assertUnidentified("/wikipedia/meta/X/74/Wikibooks-logo_sister_1x.png");
+    }
+
+    public void testMediaNonHexSecondPart() {
+        
assertUnidentified("/wikipedia/meta/7/7X/Wikibooks-logo_sister_1x.png");
+    }
+
+    public void testMediaFirstAndSecondPartMismatch() {
+        
assertUnidentified("/wikipedia/meta/7/84/Wikibooks-logo_sister_1x.png");
+    }
+
+    public void testMediaCommons() {
+        assertOriginal("/wikipedia/commons/d/dd/Fumiyuki_Beppu_Giro_2011.jpg");
+    }
+
+    public void testMediaWikibooks() {
+        assertOriginal("/wikibooks/en/b/bc/Wiki.png");
+    }
+
+    public void testMediaWiktionary() {
+        assertOriginal("/wiktionary/fr/b/bc/Wiki.png");
+    }
+
+    public void testMediaWikinews() {
+        assertOriginal("/wikinews/en/f/f7/Twitter.png");
+    }
+
+    public void testMediaWikiquote() {
+        assertOriginal("/wikiquote/en/b/bc/Wiki.png");
+    }
+
+    public void testMediaWikisource() {
+        assertOriginal("/wikisource/ar/d/dd/Foo.pdf");
+    }
+
+    public void testMediaWikiversity() {
+        assertOriginal("/wikiversity/ru/b/b6/Diffuziya_v_menzurke.jpg");
+    }
+
+    public void testMediaWikivoyage() {
+        assertOriginal("/wikivoyage/ru/c/ce/Map_mag.png");
+    }
+
+    public void testMediaWikimedia() {
+        assertImage(
+                
"/wikimedia/pl/thumb/4/47/Spraw_2010_OPP.pdf/page21-180px-Spraw_2010_OPP.pdf.jpg",
+                "/wikimedia/pl/4/47/Spraw_2010_OPP.pdf", 180);
+    }
+
+    public void testMediaWikimania2014() {
+        assertImage(
+                
"/wikipedia/wikimania2014/thumb/a/ae/Rufus_Pollock.png/293px-Rufus_Pollock.png",
+                "/wikipedia/wikimania2014/a/ae/Rufus_Pollock.png", 293);
+    }
+
+    // Test uploaded media files; Thumbs --------------------------------------
+
+    public void testMediaThumbLowQuality() {
+        assertImage(
+                
"/wikipedia/it/thumb/0/0d/Venosa-Stemma.png/50px-Venosa-Stemma.png",
+                "/wikipedia/it/0/0d/Venosa-Stemma.png", 50);
+    }
+
+    public void testMediaThumbHighQuality() {
+        assertImage(
+                
"/wikipedia/commons/thumb/0/01/USS_Texas_BB-35_aircastle.jpg/1024px-USS_Texas_BB-35_aircastle.jpg",
+                "/wikipedia/commons/0/01/USS_Texas_BB-35_aircastle.jpg", 1024);
+    }
+
+    public void testMediaThumbGif() {
+        assertImage(
+                
"/wikipedia/ar/thumb/c/c1/Logo_of_the_African_Union.png/60px-Logo_of_the_African_Union.png.gif",
+                "/wikipedia/ar/c/c1/Logo_of_the_African_Union.png", 60);
+    }
+
+    public void testMediaThumbPngJpeg() {
+        assertImage(
+                
"/wikipedia/ru/thumb/2/29/MagicDepartment.png/240px-MagicDepartment.png.jpeg",
+                "/wikipedia/ru/2/29/MagicDepartment.png", 240);
+    }
+
+    public void testMediaThumbDjvu() {
+        assertImage(
+                
"/wikipedia/commons/thumb/b/b5/foo.djvu/page1-800px-thumbnail.djvu.jpg",
+                "/wikipedia/commons/b/b5/foo.djvu", 800);
+    }
+
+    public void testMediaThumbSvgPng() {
+        assertImage(
+                
"/wikipedia/commons/thumb/a/ae/Flag_of_the_United_Kingdom.svg/24px-Flag_of_the_United_Kingdom.svg.png",
+                "/wikipedia/commons/a/ae/Flag_of_the_United_Kingdom.svg", 24);
+    }
+
+    public void testMovieThumbWithoutFormatEnding() {
+        assertImage(
+                
"/wikipedia/commons/thumb/9/9f/Chicago_-_State_St_at_Madison_Ave%2C_1897.ogv/180px-Chicago_-_State_St_at_Madison_Ave%2C_1897.ogv",
+                
"/wikipedia/commons/9/9f/Chicago_-_State_St_at_Madison_Ave,_1897.ogv",
+                180);
+    }
+
+    // Test uploaded media files; Specialities --------------------------------
+
+    public void testMediaThumbQLow() {
+        assertImage(
+                
"/wikipedia/commons/thumb/8/8c/Google_Mountain_View_campus_garden.jpg/qlow-330px-Google_Mountain_View_campus_garden.jpg",
+                
"/wikipedia/commons/8/8c/Google_Mountain_View_campus_garden.jpg",
+                330);
+    }
+
+
+    public void testMediaThumbMid() {
+        assertImage(
+                
"/wikipedia/commons/thumb/7/7d/Will_Success_Spoil_Rock_Hunter_trailer.ogv/mid-Will_Success_Spoil_Rock_Hunter_trailer.ogv.jpg",
+                
"/wikipedia/commons/7/7d/Will_Success_Spoil_Rock_Hunter_trailer.ogv",
+                null);
+    }
+
+    public void testMediaSeek() {
+        assertImage(
+                
"/wikipedia/commons/thumb/3/3d/Suez_nationalization.ogv/seek%3D151-Suez_nationalization.ogv.jpg",
+                "/wikipedia/commons/3/3d/Suez_nationalization.ogv",
+                null);
+    }
+
+    public void testMediaSeekWithResolution() {
+        assertImage(
+                
"/wikipedia/commons/thumb/3/3d/Suez_nationalization.ogv/1000px-seek%3D151-Suez_nationalization.ogv.jpg",
+                "/wikipedia/commons/3/3d/Suez_nationalization.ogv", 1000);
+    }
+
+    public void testMediaThumbWithoutNameDuplication() {
+        assertImage(
+                
"/wikipedia/commons/thumb/8/8c/Google_Mountain_View_campus_garden.jpg/330px-thumbnail.jpg",
+                
"/wikipedia/commons/8/8c/Google_Mountain_View_campus_garden.jpg",
+                330);
+    }
+
+    public void testMediaThumbPdfWithoutNameDuplication() {
+        assertImage(
+                
"/wikipedia/commons/thumb/2/21/Quetzalcatl_-_Divindade_adorada_pelos_Asteca_Tolteca_e_Maias_quem_teria_no_s_originado_os_homens_como_tambm_providenciado_seu_principal_alimento_o_milho.pdf/page1-220px-thumbnail.pdf.jpg",
+                
"/wikipedia/commons/2/21/Quetzalcatl_-_Divindade_adorada_pelos_Asteca_Tolteca_e_Maias_quem_teria_no_s_originado_os_homens_como_tambm_providenciado_seu_principal_alimento_o_milho.pdf",
+                220);
+    }
+
+    public void testMediaThumbTifWithoutNameDuplication() {
+        assertImage(
+                
"/wikipedia/commons/thumb/7/72/EXTERIOR_DETAIL_VIEW_OF_THE_UMBRA_FROM_THE_SOUTH_-_Mark_Twain_House_351_Farmington_Avenue_corrected_from_original_address_of_531_Farmington_Avenue_Hartford_Hartford_HABS_CONN-HARF16-30.tif/lossy-page1-120px-thumbnail.tif.jpg",
+                
"/wikipedia/commons/7/72/EXTERIOR_DETAIL_VIEW_OF_THE_UMBRA_FROM_THE_SOUTH_-_Mark_Twain_House_351_Farmington_Avenue_corrected_from_original_address_of_531_Farmington_Avenue_Hartford_Hartford_HABS_CONN-HARF16-30.tif",
+                120);
+    }
+
+    public void testMediaThumbSvgWithoutNameDuplication() {
+        assertImage(
+                
"/wikipedia/commons/thumb/6/6e/ABS-6457.0-InternationalTradePriceIndexesAustralia-ExportPriceIndexBySitcIndexNumbersPercentageChanges-IndexNumbers-ManufacturedGoodsClassifiedChieflyByMaterial6-A2295543A.svg/300px-thumbnail.svg.png",
+                
"/wikipedia/commons/6/6e/ABS-6457.0-InternationalTradePriceIndexesAustralia-ExportPriceIndexBySitcIndexNumbersPercentageChanges-IndexNumbers-ManufacturedGoodsClassifiedChieflyByMaterial6-A2295543A.svg",
+                300);
+    }
+
+    public void testMediaThumbOgvWithoutNameDuplication() {
+        assertImage(
+                
"/wikipedia/commons/thumb/e/e8/Putin_talk_2011-12-15_00695-00810_....ogv/250px--thumbnail.ogv.jpg",
+                
"/wikipedia/commons/e/e8/Putin_talk_2011-12-15_00695-00810_....ogv",
+                250);
+    }
+
+    public void testMediaThumbTiffWithoutNameDuplication() {
+        assertImage(
+                
"/wikipedia/commons/thumb/a/a1/Queens__Vol._2..._NYPL1693954.tiff/lossy-page1-120px-thumbnail.tiff.jpg",
+                "/wikipedia/commons/a/a1/Queens__Vol._2..._NYPL1693954.tiff",
+                120);
+    }
+
+    public void testMediaThumbLangFr() {
+        assertImage(
+                
"/wikipedia/commons/thumb/8/85/Defaut.svg/langfr-250px-Defaut.svg.png",
+                "/wikipedia/commons/8/85/Defaut.svg", 250);
+    }
+
+    public void testMediaThumbLangFrHighResolution() {
+        assertImage(
+                
"/wikipedia/commons/thumb/8/85/Defaut.svg/langfr-2500px-Defaut.svg.png",
+                "/wikipedia/commons/8/85/Defaut.svg", 2500);
+    }
+
+    public void testMediaThumbLangPl() {
+        assertImage(
+                
"/wikipedia/commons/thumb/1/1d/First_Ionization_Energy.svg/langpl-400px-First_Ionization_Energy.svg.png",
+                "/wikipedia/commons/1/1d/First_Ionization_Energy.svg", 400);
+    }
+
+    public void testMediaThumbLangPlHighResolution() {
+        assertImage(
+                
"/wikipedia/commons/thumb/1/1d/First_Ionization_Energy.svg/langpl-4000px-First_Ionization_Energy.svg.png",
+                "/wikipedia/commons/1/1d/First_Ionization_Energy.svg", 4000);
+    }
+
+    public void testMediaThumbLangZhHans() {
+        assertImage(
+                
"/wikipedia/commons/thumb/1/1d/First_Ionization_Energy.svg/langzh-hans-400px-First_Ionization_Energy.svg.png",
+                "/wikipedia/commons/1/1d/First_Ionization_Energy.svg", 400);
+    }
+
+    public void testMediaThumbLangZhHansHighResolution() {
+        assertImage(
+                
"/wikipedia/commons/thumb/1/1d/First_Ionization_Energy.svg/langzh-hans-4000px-First_Ionization_Energy.svg.png",
+                "/wikipedia/commons/1/1d/First_Ionization_Energy.svg", 4000);
+    }
+
+    public void testMediaThumbLangUpperCase() {
+        
assertUnidentified("/wikipedia/commons/thumb/1/1d/First_Ionization_Energy.svg/langXr-400px-First_Ionization_Energy.svg.png");
+    }
+
+    public void testMediaThumbLangNumber() {
+        
assertUnidentified("/wikipedia/commons/thumb/1/1d/First_Ionization_Energy.svg/lang7-400px-First_Ionization_Energy.svg.png");
+    }
+
+    public void testMediaThumbLangNonALpha() {
+        
assertUnidentified("/wikipedia/commons/thumb/1/1d/First_Ionization_Energy.svg/lang?-400px-First_Ionization_Energy.svg.png");
+    }
+
+    public void testMediaThumbPaged() {
+        assertImage(
+                
"/wikipedia/commons/thumb/6/6a/DiagFuncMacroSyst.pdf/page1-450px-DiagFuncMacroSyst.pdf.jpg",
+                "/wikipedia/commons/6/6a/DiagFuncMacroSyst.pdf", 450);
+    }
+
+    public void testMediaThumbPagedLossy() {
+        assertImage(
+                
"/wikipedia/commons/thumb/0/02/1969_Afghanistan_Sistan_wind_ripples.tiff/lossy-page1-220px-1969_Afghanistan_Sistan_wind_ripples.tiff.jpg",
+                
"/wikipedia/commons/0/02/1969_Afghanistan_Sistan_wind_ripples.tiff",
+                220);
+    }
+
+    public void testMediaThumbPagedLossless() {
+        assertImage(
+                
"/wikipedia/commons/thumb/b/b2/KeyCard.tiff/lossless-page1-220px-KeyCard.tiff.png",
+                "/wikipedia/commons/b/b2/KeyCard.tiff", 220);
+    }
+
+    /**
+     * A thumb whose file name has a double dash after the pixel width
+     * ("220px--") still gets identified as image of that width.
+     */
+    public void testMediaThumbDoubleDash() {
+        assertImage(
+                "http://upload.wikimedia.org/wikipedia/commons/thumb/1/1a/Reichelt.ogg/220px--Reichelt.ogg.jpg",
+                "/wikipedia/commons/1/1a/Reichelt.ogg", 220);
+    }
+
+    public void testMediaThumbSeekInteger() {
+        assertImage(
+                
"/wikipedia/commons/thumb/d/df/Emu_feeding_on_grass.ogg/220px-seek%3D43-Emu_feeding_on_grass.ogg.jpg",
+                "/wikipedia/commons/d/df/Emu_feeding_on_grass.ogg", 220);
+    }
+
+    public void testMediaThumbSeekFraction() {
+        assertImage(
+                
"/wikipedia/commons/thumb/a/af/Pelecanus_occidentalis_-Jamaica_-fishing-8.ogv/220px-seek%3D2.5-Pelecanus_occidentalis_-Jamaica_-fishing-8.ogv.jpg",
+                
"/wikipedia/commons/a/af/Pelecanus_occidentalis_-Jamaica_-fishing-8.ogv",
+                220);
+    }
+
+    public void testMediaThumbTemp() {
+        assertImage(
+                
"/wikipedia/commons/thumb/temp/5/57/20141026110003%21phphHKAQ2.jpg/100px-20141026110003%21phphHKAQ2.jpg",
+                "/wikipedia/commons/temp/5/57/20141026110003!phphHKAQ2.jpg",
+                100);
+    }
+
+    // Test uploaded media files; Archive -------------------------------------
+
+    public void testMediaArchive() {
+        assertOriginal(
+                "/wikipedia/sr/archive/2/25/20121208204804!ZvezdeGranda.jpg",
+                "/wikipedia/sr/archive/2/25/20121208204804!ZvezdeGranda.jpg");
+    }
+
+    public void testMediaArchiveEncoded() {
+        assertOriginal(
+                "/wikipedia/sr/archive/2/25/20121208204804%21ZvezdeGranda.jpg",
+                "/wikipedia/sr/archive/2/25/20121208204804!ZvezdeGranda.jpg");
+    }
+
+    public void testMediaThumbArchive() {
+        assertImage(
+                
"/wikipedia/commons/thumb/archive/b/bb/20100220202202!Polska_1386_-_1434.png/120px-Polska_1386_-_1434.png",
+                
"/wikipedia/commons/archive/b/bb/20100220202202!Polska_1386_-_1434.png",
+                120);
+    }
+
+    public void testMediaThumbArchiveEncoded() {
+        assertImage(
+                
"/wikipedia/commons/thumb/archive/b/bb/20100220202202%21Polska_1386_-_1434.png/120px-Polska_1386_-_1434.png",
+                
"/wikipedia/commons/archive/b/bb/20100220202202!Polska_1386_-_1434.png",
+                120);
+    }
+
+    // Test uploaded media files; Transcoded ----------------------------------
+
+    public void testMediaTranscodedWebM() {
+        assertMovie(
+                
"/wikipedia/commons/transcoded/3/31/Lheure_du_foo.ogv/Lheure_du_foo.ogv.360p.webm",
+                "/wikipedia/commons/3/31/Lheure_du_foo.ogv", 360);
+    }
+
+    public void testMediaTranscodedWebMHighResolution() {
+        assertMovie(
+                
"/wikipedia/commons/transcoded/3/31/Lheure_du_foo.ogv/Lheure_du_foo.ogv.480p.webm",
+                "/wikipedia/commons/3/31/Lheure_du_foo.ogv",
+                480);
+    }
+
+    public void testMediaTranscodedOgv() {
+        assertMovie(
+                
"/wikipedia/commons/transcoded/d/d3/Yvonne_Strahovski_about_her_acting_career.webm/Yvonne_Strahovski_about_her_acting_career.webm.360p.ogv",
+                
"/wikipedia/commons/d/d3/Yvonne_Strahovski_about_her_acting_career.webm",
+                360);
+    }
+
+    public void testMediaTranscodedOgvHighResolution() {
+        assertMovie(
+                
"/wikipedia/commons/transcoded/d/d3/Yvonne_Strahovski_about_her_acting_career.webm/Yvonne_Strahovski_about_her_acting_career.webm.480p.ogv",
+                
"/wikipedia/commons/d/d3/Yvonne_Strahovski_about_her_acting_career.webm",
+                480);
+    }
+
+    public void testMediaTranscodedOgg() {
+        assertAudio(
+                
"/wikipedia/commons/transcoded/b/bd/Xylophone_jingle.wav/Xylophone_jingle.wav.ogg",
+                "/wikipedia/commons/b/bd/Xylophone_jingle.wav");
+    }
+}
diff --git 
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsLegacyPageviewUDF.java
 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsLegacyPageviewUDF.java
index 733cb20..9bd5905 100644
--- 
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsLegacyPageviewUDF.java
+++ 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsLegacyPageviewUDF.java
@@ -22,7 +22,7 @@
 
 /**
  * A Hive UDF to identify what requests constitute "pageviews",
- * according to the definition at 
+ * according to the definition at
  * 
https://github.com/wikimedia/analytics-refinery/blob/master/oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql
  * This is the "legacy" definition, in use by WebStatsCollector and the
  * pageviews dumps at http://dumps.wikimedia.org/other/pagecounts-ez/
diff --git 
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/MediaFileUrlParserUDF.java
 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/MediaFileUrlParserUDF.java
new file mode 100644
index 0000000..88e3a50
--- /dev/null
+++ 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/MediaFileUrlParserUDF.java
@@ -0,0 +1,165 @@
+// Copyright 2014 Wikimedia Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.wikimedia.analytics.refinery.hive;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
+import org.wikimedia.analytics.refinery.core.MediaFileUrlParser;
+import org.wikimedia.analytics.refinery.core.MediaFileUrlInfo;
+import org.wikimedia.analytics.refinery.core.MediaFileUrlInfo.Classification;
+import org.wikimedia.analytics.refinery.core.PercentEncoder;
+
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * Hive UDF that parses the url of a media file into a struct of details.
+ * <p>
+ * The returned struct has the fields base_name (string), is_original,
+ * is_transcoded_to_audio, is_transcoded_to_image, is_transcoded_to_movie
+ * (booleans), and width and height (ints).
+ * <p>
+ * For urls that the parser does not identify, the boolean fields are false
+ * and the remaining fields are null.
+ */
+// "deterministic" is the default anyway, but we want to make it visible,
+// hence we set it explicitly.
+@UDFType(deterministic = true)
+@Description(name = "parse_media_file_url",
+    value = "_FUNC_(url) - Returns a struct of details to a media file url",
+    extended = "argument 0 is the url to analyze")
+public class MediaFileUrlParserUDF extends GenericUDF {
+    // Re-used for every row; allocated in initialize.
+    private Object[] result;
+
+    private StringObjectInspector inputOI;
+
+    // Positions of the struct fields within result; assigned in initialize.
+    private int idxBaseName;
+    private int idxIsOriginal;
+    private int idxIsTranscodedAudio;
+    private int idxIsTranscodedImage;
+    private int idxIsTranscodedMovie;
+    private int idxWidth;
+    private int idxHeight;
+
+    /**
+     * Checks the argument inspectors and sets up the result struct.
+     *
+     * @param arguments the inspectors for the UDF's arguments; has to hold
+     *     exactly one string inspector
+     * @return the struct inspector describing the UDF's result
+     * @throws UDFArgumentLengthException if not exactly 1 argument is given
+     * @throws UDFArgumentTypeException if the argument is not a string
+     */
+    @Override
+    public ObjectInspector initialize(ObjectInspector[] arguments)
+            throws UDFArgumentException {
+        // We need exactly 1 parameter
+        if (arguments == null || arguments.length != 1) {
+            throw new UDFArgumentLengthException("The function "
+                    + "MediaFileUrlParserUDF expects exactly 1 parameter");
+        }
+
+        // ... and the parameter has to be a string
+        if (!(arguments[0] instanceof StringObjectInspector)) {
+            throw new UDFArgumentTypeException(0, "The parameter to "
+                    + "MediaFileUrlParserUDF has to be a string");
+        }
+
+        inputOI = (StringObjectInspector) arguments[0];
+
+        List<String> fieldNames = new LinkedList<String>();
+        List<ObjectInspector> fieldOIs = new LinkedList<ObjectInspector>();
+        int idx = 0;
+
+        fieldNames.add("base_name");
+        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
+        idxBaseName = idx++;
+
+        fieldNames.add("is_original");
+        fieldOIs.add(PrimitiveObjectInspectorFactory.javaBooleanObjectInspector);
+        idxIsOriginal = idx++;
+
+        fieldNames.add("is_transcoded_to_audio");
+        fieldOIs.add(PrimitiveObjectInspectorFactory.javaBooleanObjectInspector);
+        idxIsTranscodedAudio = idx++;
+
+        fieldNames.add("is_transcoded_to_image");
+        fieldOIs.add(PrimitiveObjectInspectorFactory.javaBooleanObjectInspector);
+        idxIsTranscodedImage = idx++;
+
+        fieldNames.add("is_transcoded_to_movie");
+        fieldOIs.add(PrimitiveObjectInspectorFactory.javaBooleanObjectInspector);
+        idxIsTranscodedMovie = idx++;
+
+        fieldNames.add("width");
+        fieldOIs.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector);
+        idxWidth = idx++;
+
+        fieldNames.add("height");
+        fieldOIs.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector);
+        idxHeight = idx++;
+
+        // idx now equals the number of fields added above.
+        result = new Object[idx];
+
+        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
+    }
+
+    /**
+     * Parses the url given as sole argument and fills the result struct.
+     *
+     * @param arguments array holding exactly the url to parse
+     * @return the result array, which is re-used between calls
+     */
+    @Override
+    public Object evaluate(DeferredObject[] arguments) throws HiveException {
+        assert arguments != null : "Method 'evaluate' of MediaFileUrlParserUDF "
+                + "called with null arguments array";
+        assert arguments.length == 1 : "Method 'evaluate' of "
+                + "MediaFileUrlParserUDF called arguments of length "
+                + arguments.length + " (instead of 1)";
+        // arguments is an array with exactly 1 entry.
+
+        assert result != null : "Result object has not yet been initialized, "
+                + "but evaluate called";
+        // result object has been initialized. So it's an array of objects of
+        // the right length.
+
+        String url = inputOI.getPrimitiveJavaObject(arguments[0].get());
+
+        MediaFileUrlInfo info = MediaFileUrlParser.parse(url);
+
+        if (info == null) {
+            // The parser could not identify the url.
+            result[idxBaseName] = null;
+
+            result[idxIsOriginal] = false;
+            result[idxIsTranscodedAudio] = false;
+            result[idxIsTranscodedImage] = false;
+            result[idxIsTranscodedMovie] = false;
+
+            result[idxWidth] = null;
+            result[idxHeight] = null;
+        } else {
+            result[idxBaseName] = PercentEncoder.encode(info.getBaseName());
+
+            Classification classification = info.getClassification();
+            result[idxIsOriginal] = (classification == Classification.ORIGINAL);
+            result[idxIsTranscodedAudio] = (classification == Classification.TRANSCODED_TO_AUDIO);
+            result[idxIsTranscodedImage] = (classification == Classification.TRANSCODED_TO_IMAGE);
+            result[idxIsTranscodedMovie] = (classification == Classification.TRANSCODED_TO_MOVIE);
+
+            result[idxWidth] = info.getWidth();
+            result[idxHeight] = info.getHeight();
+        }
+
+        return result;
+    }
+
+    /**
+     * Renders the function call for display.
+     *
+     * @param arguments the display strings of the arguments
+     * @return "parse_media_file_url(...)" around the single argument, or
+     *     around a placeholder if not exactly one argument is given
+     */
+    @Override
+    public String getDisplayString(String[] arguments) {
+        String argument;
+        if (arguments == null) {
+            argument = "<arguments == null>";
+        } else if (arguments.length == 1) {
+            argument = arguments[0];
+        } else {
+            argument = "<arguments of length " + arguments.length + ">";
+        }
+        return "parse_media_file_url(" + argument + ")";
+    }
+}
diff --git 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestMediaFileUrlParserUDF.java
 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestMediaFileUrlParserUDF.java
new file mode 100644
index 0000000..8a6694e
--- /dev/null
+++ 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestMediaFileUrlParserUDF.java
@@ -0,0 +1,156 @@
+// Copyright 2014 Wikimedia Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.wikimedia.analytics.refinery.hive;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.wikimedia.analytics.refinery.core.MediaFileUrlInfo.Classification;
+
+import junit.framework.TestCase;
+
+public class TestMediaFileUrlParserUDF extends TestCase {
+    ObjectInspector StringOI = 
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+    ObjectInspector LongOI = 
PrimitiveObjectInspectorFactory.javaLongObjectInspector;
+
+    private Object callUDF(String url) throws HiveException, IOException {
+        DeferredObject urlDO = new DeferredJavaObject(url);
+        DeferredObject[] arguments = new DeferredObject[] {urlDO};
+        Object res = null;
+
+        MediaFileUrlParserUDF udf = new MediaFileUrlParserUDF();
+        try {
+            udf.initialize(new ObjectInspector[]{StringOI});
+            res = udf.evaluate(arguments);
+        } finally {
+            udf.close();
+        }
+        return res;
+    }
+
+    private void assertOutput(String url, String baseName,
+            Classification classification, Integer width, Integer height)
+            throws HiveException, IOException {
+        Object[] res = (Object[]) callUDF(url);
+
+        assertEquals("Result array has wrong length", 7, res.length);
+
+        assertEquals("baseName does not match", baseName, res[0]);
+
+        assertEquals("is_original does not match", classification == 
Classification.ORIGINAL, res[1]);
+        assertEquals("is_high_quality does not match", classification == 
Classification.TRANSCODED_TO_AUDIO, res[2]);
+        assertEquals("is_low_quality does not match", classification == 
Classification.TRANSCODED_TO_IMAGE, res[3]);
+        assertEquals("is_low_quality does not match", classification == 
Classification.TRANSCODED_TO_MOVIE, res[4]);
+
+        if (width == null) {
+            assertNull("width is not null", res[5]);
+        } else {
+            assertEquals("width does not match", width, res[5]);
+        }
+
+        if (height == null) {
+            assertNull("height is not null", res[6]);
+        } else {
+            assertEquals("height does not match", height, res[6]);
+        }
+    }
+
+    public void testInitialize() throws HiveException, IOException {
+        MediaFileUrlParserUDF udf = new MediaFileUrlParserUDF();
+        try {
+            udf.initialize(new ObjectInspector[]{StringOI});
+        } finally {
+            udf.close();
+        }
+    }
+
+    public void testInitializeEmpty() throws HiveException, IOException {
+        MediaFileUrlParserUDF udf = new MediaFileUrlParserUDF();
+        try {
+            udf.initialize(new ObjectInspector[]{});
+            fail("Initialize did not throw HiveException");
+        } catch (HiveException e) {
+        } finally {
+            udf.close();
+        }
+    }
+
+    public void testInitializeWrongType() throws HiveException, IOException {
+        MediaFileUrlParserUDF udf = new MediaFileUrlParserUDF();
+        try {
+            udf.initialize(new ObjectInspector[]{LongOI});
+            fail("Initialize did not throw HiveException");
+        } catch (HiveException e) {
+        } finally {
+            udf.close();
+        }
+    }
+
+    public void testEvaluateUnknown() throws HiveException, IOException {
+        assertOutput("foo", null, Classification.UNKNOWN, null, null);
+    }
+
+    public void testEvaluateOriginal() throws HiveException, IOException {
+        assertOutput("/math/d/a/9/da9d325123d50dbc4e36363f2863ce3e.png",
+                "/math/d/a/9/da9d325123d50dbc4e36363f2863ce3e.png",
+                Classification.ORIGINAL, null, null);
+    }
+
+    public void testEvaluateAudio() throws HiveException, IOException {
+        assertOutput(
+                
"/wikipedia/commons/transcoded/b/bd/Xylophone_jingle.wav/Xylophone_jingle.wav.ogg",
+                "/wikipedia/commons/b/bd/Xylophone_jingle.wav",
+                Classification.TRANSCODED_TO_AUDIO, null, null);
+    }
+
+    public void testEvaluateImageWithWidth() throws HiveException, IOException 
{
+        assertOutput(
+                
"/wikipedia/commons/thumb/a/ae/Flag_of_the_United_Kingdom.svg/1024px-Flag_of_the_United_Kingdom.svg.png",
+                "/wikipedia/commons/a/ae/Flag_of_the_United_Kingdom.svg",
+                Classification.TRANSCODED_TO_IMAGE, 1024, null);
+    }
+
+    public void testEvaluateImageWithoutWidth() throws HiveException, 
IOException {
+        assertOutput(
+                
"/wikipedia/commons/thumb/a/ae/Flag_of_the_United_Kingdom.svg/mid-Flag_of_the_United_Kingdom.svg.png",
+                "/wikipedia/commons/a/ae/Flag_of_the_United_Kingdom.svg",
+                Classification.TRANSCODED_TO_IMAGE, null, null);
+    }
+
+    public void testEvaluateMovie() throws HiveException, IOException {
+        assertOutput(
+                
"/wikipedia/commons/transcoded/3/31/Lheure_du_foo.ogv/Lheure_du_foo.ogv.360p.webm",
+                "/wikipedia/commons/3/31/Lheure_du_foo.ogv",
+                Classification.TRANSCODED_TO_MOVIE, null, 360);
+    }
+
+    public void testEncodedInput() throws HiveException, IOException {
+        assertOutput(
+                "/wikipedia/commons/a/ae/F%6F%6f.svg",
+                "/wikipedia/commons/a/ae/Foo.svg",
+                Classification.ORIGINAL, null, null);
+    }
+
+    public void testEncodedOutput() throws HiveException, IOException {
+        assertOutput(
+                "/wikipedia/commons/a/ae/F\no—o.svg",
+                "/wikipedia/commons/a/ae/F%0Ao%E2%80%94o.svg",
+                Classification.ORIGINAL, null, null);
+    }
+}
\ No newline at end of file

-- 
To view, visit https://gerrit.wikimedia.org/r/189981
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I1b76e1e331ea781aee13557fc55a2c19ce5744a7
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery/source
Gerrit-Branch: master
Gerrit-Owner: QChris <christ...@quelltextlich.at>
Gerrit-Reviewer: Ottomata <o...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to