<face_palm/>

Still couldn't find any problems with actual multithreaded code. :(

    /**
     * Stress-tests charset detection from many threads at once.
     *
     * <p>Every {@code .txt} and {@code .html} file directly under the
     * {@code /test-documents} resource directory is first run through
     * {@link #getEncoding(Path)} on the calling thread to record a baseline.
     * One hundred {@code EncodingDetector} workers then re-detect randomly
     * chosen files concurrently; a worker throws if a detection differs from
     * the baseline, and that failure is rethrown here by {@code Future.get()}.
     *
     * @throws Exception if the fixtures cannot be read, or if any worker fails
     */
    @Test
    public void testMultiThreadingEncodingDetection() throws Exception {
        Path testDocs = Paths.get(this.getClass().getResource("/test-documents").toURI());
        List<Path> paths = new ArrayList<>();
        Map<Path, String> encodings = new ConcurrentHashMap<>();

        // listFiles() returns null when the path is not a listable directory;
        // fail with a clear message instead of a NullPointerException.
        File[] candidates = testDocs.toFile().listFiles();
        assertTrue("could not list files in " + testDocs, candidates != null);
        for (File file : candidates) {
            String name = file.getName();
            if (name.endsWith(".txt") || name.endsWith(".html")) {
                Path path = file.toPath();
                // Baseline detection, performed before any worker thread exists.
                String encoding = getEncoding(path);
                paths.add(path);
                encodings.put(path, encoding);
            }
        }
        // Without fixtures the workers would die in Random.nextInt(0); say so up front.
        assertTrue("no .txt or .html files found in " + testDocs, !paths.isEmpty());

        int numThreads = 100;
        ExecutorService ex = Executors.newFixedThreadPool(numThreads);
        try {
            CompletionService<String> completionService =
                    new ExecutorCompletionService<>(ex);
            for (int i = 0; i < numThreads; i++) {
                completionService.submit(new EncodingDetector(paths, encodings), "done");
            }
            // take() only hands back finished futures; get() rethrows a worker's
            // failure wrapped in an ExecutionException, which fails the test.
            for (int completed = 0; completed < numThreads; completed++) {
                String result = completionService.take().get();
                assertTrue("unexpected worker result: " + result, "done".equals(result));
            }
        } finally {
            // Always release the pool threads, including when a worker has failed.
            ex.shutdownNow();
        }
    }

    /**
     * Worker that repeatedly re-detects the encoding of randomly chosen files
     * and compares each result with the expected value supplied at construction.
     *
     * <p>Kept as an inner (non-static) class because it calls the enclosing
     * instance's {@code getEncoding(Path)}.
     */
    private class EncodingDetector implements Runnable {
        private final List<Path> paths;
        private final Map<Path, String> encodings;
        private final Random random = new Random();

        /**
         * @param paths     candidate files to sample from
         * @param encodings expected encoding for each candidate file
         */
        private EncodingDetector(List<Path> paths, Map<Path, String> encodings) {
            this.paths = paths;
            this.encodings = encodings;
        }

        /**
         * Performs 1000 detections on randomly selected paths.
         *
         * @throws RuntimeException if detection throws, or if a detected
         *         encoding differs from the expected one
         */
        @Override
        public void run() {
            int remaining = 1000;
            while (remaining-- > 0) {
                Path candidate = paths.get(random.nextInt(paths.size()));

                String detected;
                try {
                    detected = getEncoding(candidate);
                } catch (Exception e) {
                    // Runnable.run() cannot throw checked exceptions; wrap and keep the cause.
                    throw new RuntimeException(e);
                }

                String expected = encodings.get(candidate);
                if (detected.equals(expected)) {
                    continue;
                }
                throw new RuntimeException("detected: " + detected +
                        " but should have been: " + expected);
            }
        }
    }

    /**
     * Detects the character encoding of the file at {@code p} by opening it
     * through {@code TikaInputStream} and letting {@code AutoDetectReader}
     * choose a charset.
     *
     * @param p the file to inspect
     * @return the string form of the charset reported by the reader, or the
     *         literal {@code "NULL"} if the reader reports no charset
     * @throws Exception whatever is thrown while opening the stream or
     *         constructing the reader
     */
    public String getEncoding(Path p) throws Exception {
        // Both resources are closed on exit; the original left the reader unclosed.
        try (InputStream is = TikaInputStream.get(p);
             AutoDetectReader reader = new AutoDetectReader(is)) {
            // Guard the charset itself: Charset.toString() never returns null,
            // so a null check on its result could never fire.
            java.nio.charset.Charset charset = reader.getCharset();
            return charset == null ? "NULL" : charset.toString();
        }
    }

-----Original Message-----
From: Allison, Timothy B. [mailto:[email protected]] 
Sent: Monday, July 25, 2016 10:17 PM
To: [email protected]
Subject: RE: Is Tika (especially CharsetDetector) considered thread-safe?

With 1.13 and this code, I'm not able to see any problems with our handful of 
test files in our unit tests.  

Exactly what code are you using?  How are you doing detection?

Reply via email to