[ https://issues.apache.org/jira/browse/TIKA-2041?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15394007#comment-15394007 ]
Tim Allison commented on TIKA-2041: ----------------------------------- Thank you for the confirmation. Unless my colleagues object, I'll re-copy/paste the classes that we use from ICU4J from the latest release (or trunk?). That should fix this. > Charset detection doesn't appear to be thread-safe > -------------------------------------------------- > > Key: TIKA-2041 > URL: https://issues.apache.org/jira/browse/TIKA-2041 > Project: Tika > Issue Type: Bug > Reporter: Tim Allison > > On the user list, Christian Leitinger noted that his team found a potential > issue with the thread safety of the encoding detector. I was able to > reproduce this on the corpus of html files in [~faghani]'s encoding > detector. > {noformat} > @Test > public void testMultiThreadingEncodingDetection() throws Exception { > Path testDocs = Paths.get("C:/data/encodings/corpus"); > List<Path> paths = new ArrayList<>(); > Map<Path, String> encodings = new ConcurrentHashMap<>(); > for (File encodingDirs : testDocs.toFile().listFiles()) { > for (File file : encodingDirs.listFiles()) { > String encoding = getEncoding(file.toPath()); > paths.add(file.toPath()); > encodings.put(file.toPath(), encoding); > } > } > int numThreads = 1000; > ExecutorService ex = Executors.newFixedThreadPool(numThreads); > CompletionService<String> completionService = > new ExecutorCompletionService<>(ex); > for (int i = 0; i < numThreads; i++) { > completionService.submit(new EncodingDetectorRunner(paths, > encodings), "done"); > } > int completed = 0; > while (completed < numThreads) { > Future<String> future = completionService.take(); > if (future.isDone() && "done".equals(future.get())) { > completed++; > } > } > assertTrue("success!", true); > } > private class EncodingDetectorRunner implements Runnable { > private final List<Path> paths; > private final Map<Path, String> encodings; > private final Random r = new Random(); > private EncodingDetectorRunner(List<Path> paths, Map<Path, String> > encodings) { > 
this.paths = paths; > this.encodings = encodings; > } > @Override > public void run() { > for (int i = 0; i < 100; i++) { > int pInd = r.nextInt(paths.size()); > String detectedEncoding = null; > try { > detectedEncoding = getEncoding(paths.get(pInd)); > } catch (Exception e) { > throw new RuntimeException(e); > } > String trueEncoding = encodings.get(paths.get(pInd)); > if (! detectedEncoding.equals(trueEncoding)) { > throw new RuntimeException("detected: " + > detectedEncoding + > " but should have been: "+trueEncoding + " for " > + paths.get(pInd)); > } > } > } > } > public String getEncoding(Path p) throws Exception { > try (InputStream is = TikaInputStream.get(p)) { > AutoDetectReader reader = new AutoDetectReader(is); > String val = reader.getCharset().toString(); > if (val == null) { > return "NULL"; > } else { > return val; > } > } > } > {noformat} > yields: > {noformat} > java.util.concurrent.ExecutionException: java.lang.RuntimeException: detected: > ISO-8859-1 but should have been: windows-1252 for > C:\data\encodings\corpus\Shift_JIS\1 > at java.util.concurrent.FutureTask.report(FutureTask.java:122) > at java.util.concurrent.FutureTask.get(FutureTask.java:192) > at > org.apache.tika.parser.html.HtmlParserTest.testMultiThreadingEncodingDetection(HtmlParserTest.java:1213) > {noformat} -- This message was sent by Atlassian JIRA (v6.3.4#6332)