To be sure, I've added tests with byte-order marks, and the presence of a
BOM doesn't influence the detection result:
@Test
public void testDetect() throws IOException {
final Detector detector = new Tika().getDetector();
final Metadata metadata = new Metadata();
try (final InputStream in = new BufferedInputStream(new
ByteArrayInputStream("<data>42</data>".getBytes(StandardCharsets.US_ASCII))))
{
assertEquals(MediaType.TEXT_PLAIN, detector.detect(in,
metadata).getBaseType());
}
try (final InputStream in = new BufferedInputStream(new
ByteArrayInputStream("<?xml?><data>42</data>".getBytes(StandardCharsets.US_ASCII))))
{
assertEquals(MediaType.TEXT_PLAIN, detector.detect(in,
metadata).getBaseType());
}
try (final InputStream in = new BufferedInputStream(new
ByteArrayInputStream("<?xml
version='1.0'?><data>42</data>".getBytes(StandardCharsets.US_ASCII)))) {
assertEquals(MediaType.APPLICATION_XML, detector.detect(in,
metadata).getBaseType());
}
try (final InputStream in = new BufferedInputStream(new
ByteArrayInputStream("\uFEFF<?xml
version='1.0'?><data>42</data>".getBytes(StandardCharsets.UTF_8)))) {
// UTF-8 BOM, follweod by the '<' char:
assertEquals(0xEF, in.read());
assertEquals(0xBB, in.read());
assertEquals(0xBF, in.read());
assertEquals('<', in.read());
}
try (final InputStream in = new BufferedInputStream(new
ByteArrayInputStream("\uFEFF<data>42</data>".getBytes(StandardCharsets.UTF_8))))
{
assertEquals(MediaType.TEXT_PLAIN, detector.detect(in,
metadata).getBaseType());
}
try (final InputStream in = new BufferedInputStream(new
ByteArrayInputStream("\uFEFF<?xml?><data>42</data>".getBytes(StandardCharsets.UTF_8))))
{
assertEquals(MediaType.TEXT_PLAIN, detector.detect(in,
metadata).getBaseType());
}
try (final InputStream in = new BufferedInputStream(new
ByteArrayInputStream("\uFEFF<?xml
version='1.0'?><data>42</data>".getBytes(StandardCharsets.UTF_8)))) {
assertEquals(MediaType.APPLICATION_XML, detector.detect(in,
metadata).getBaseType());
}
}
–John