Author: nick Date: Tue Dec 27 03:00:53 2011 New Revision: 1224865 URL: http://svn.apache.org/viewvc?rev=1224865&view=rev Log: TIKA-793 Correct the null-termination stripping in the ID3 tag code when dealing with double-byte encoded strings
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java?rev=1224865&r1=1224864&r2=1224865&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java Tue Dec 27 03:00:53 2011 @@ -182,15 +182,16 @@ public class ID3v2Frame implements MP3Fr */ protected static String getTagString(byte[] data, int offset, int length) { int actualLength = length; - while (actualLength > 0 && data[actualLength-1] == 0) { - actualLength--; - } if (actualLength == 0) { return ""; } + if (actualLength == 1 && data[offset] == 0) { + return ""; + } // Does it have an encoding flag? 
// Detect by the first byte being sub 0x20 + boolean doubleByte = false; String encoding = "ISO-8859-1"; byte maybeEncodingFlag = data[offset]; if (maybeEncodingFlag == 0 || maybeEncodingFlag == 1 || @@ -200,15 +201,29 @@ public class ID3v2Frame implements MP3Fr if (maybeEncodingFlag == 1) { // With BOM encoding = "UTF-16"; + doubleByte = true; } else if (maybeEncodingFlag == 2) { // Without BOM encoding = "UTF-16BE"; + doubleByte = true; } else if (maybeEncodingFlag == 3) { encoding = "UTF8"; } } + + // Trim off null termination / padding (as present) + while (doubleByte && actualLength >= 2 && data[offset+actualLength-1] == 0 && data[offset+actualLength-2] == 0) { + actualLength -= 2; + } + while (!doubleByte && actualLength >= 1 && data[offset+actualLength-1] == 0) { + actualLength--; + } + if (actualLength == 0) { + return ""; + } try { + // Build the base string return new String(data, offset, actualLength, encoding); } catch (UnsupportedEncodingException e) { throw new RuntimeException(