<face_palm/>
I still couldn't find any problems, even with actual multithreaded code. :(
/**
 * Runs encoding detection on the {@code .txt} and {@code .html} test documents
 * from many threads at once. The encoding of each file is first detected on the
 * calling thread; every worker then re-detects randomly chosen files and fails
 * if its result differs from that baseline.
 *
 * @throws Exception if the test documents cannot be located or listed, if the
 *                   baseline detection fails, or if any worker fails (reported
 *                   as an {@code ExecutionException})
 */
@Test
public void testMultiThreadingEncodingDetection() throws Exception {
    Path testDocs =
            Paths.get(this.getClass().getResource("/test-documents").toURI());
    File[] files = testDocs.toFile().listFiles();
    if (files == null) {
        // listFiles() returns null when the path is not a listable directory.
        throw new IllegalStateException("cannot list test documents in " + testDocs);
    }

    List<Path> paths = new ArrayList<>();
    Map<Path, String> encodings = new ConcurrentHashMap<>();
    // Baseline pass on the calling thread: record the expected encoding per file.
    for (File file : files) {
        String name = file.getName();
        if (name.endsWith(".txt") || name.endsWith(".html")) {
            System.out.println(file);
            Path path = file.toPath();
            String encoding = getEncoding(path);
            paths.add(path);
            encodings.put(path, encoding);
        }
    }
    // Without this check an empty directory only shows up as an obscure
    // Random.nextInt(0) failure inside the workers.
    assertTrue("no .txt or .html test documents found in " + testDocs,
            !paths.isEmpty());

    int numThreads = 100;
    ExecutorService ex = Executors.newFixedThreadPool(numThreads);
    try {
        CompletionService<String> completionService =
                new ExecutorCompletionService<>(ex);
        for (int i = 0; i < numThreads; i++) {
            completionService.submit(new EncodingDetector(paths, encodings),
                    "done");
        }
        for (int completed = 0; completed < numThreads; completed++) {
            // take() hands back only finished tasks; get() rethrows a worker's
            // failure wrapped in an ExecutionException, which fails the test.
            String result = completionService.take().get();
            assertTrue("unexpected worker result: " + result,
                    "done".equals(result));
        }
    } finally {
        // Always release the pool's threads, including when a worker failed.
        ex.shutdownNow();
    }
}
/**
 * Worker that performs 1000 rounds of encoding detection. Each round picks a
 * random entry from {@code paths}, detects its encoding via the enclosing
 * instance's {@code getEncoding}, and compares the result with the value stored
 * for that path in {@code encodings}.
 */
private class EncodingDetector implements Runnable {
    /** Candidate files; one is chosen at random in every round. */
    private final List<Path> paths;
    /** Expected encoding for each candidate file. */
    private final Map<Path, String> encodings;
    /** Per-worker source of random indices into {@code paths}. */
    private final Random r = new Random();

    /**
     * @param paths     files to sample from
     * @param encodings expected encoding keyed by file
     */
    private EncodingDetector(List<Path> paths, Map<Path, String> encodings) {
        this.paths = paths;
        this.encodings = encodings;
    }

    /**
     * Runs the detection rounds.
     *
     * @throws RuntimeException wrapping any exception raised during detection,
     *                          or describing the first mismatch between the
     *                          detected and the expected encoding
     */
    @Override
    public void run() {
        int remaining = 1000;
        while (remaining-- > 0) {
            int choice = r.nextInt(paths.size());
            String detected;
            try {
                detected = getEncoding(paths.get(choice));
            } catch (Exception e) {
                // run() cannot declare checked exceptions, so rethrow unchecked.
                throw new RuntimeException(e);
            }
            String expected = encodings.get(paths.get(choice));
            if (detected.equals(expected)) {
                continue;
            }
            throw new RuntimeException("detected: " + detected +
                    " but should have been: " + expected);
        }
    }
}
/**
 * Opens the given file and reports the charset chosen by an
 * {@code AutoDetectReader} constructed over its content.
 *
 * @param p file to inspect
 * @return the string form of the reader's charset, or {@code "NULL"} if the
 *         reader reports no charset
 * @throws Exception if the file cannot be opened or the reader cannot be
 *                   constructed
 */
public String getEncoding(Path p) throws Exception {
    // Both resources are closed on exit; the reader was previously left open.
    try (InputStream is = TikaInputStream.get(p);
         AutoDetectReader reader = new AutoDetectReader(is)) {
        // Check the charset itself: the old code tested the result of
        // toString(), so its "NULL" branch could never be reached.
        // Fully qualified so the block needs no additional import.
        java.nio.charset.Charset charset = reader.getCharset();
        return charset == null ? "NULL" : charset.toString();
    }
}
-----Original Message-----
From: Allison, Timothy B. [mailto:[email protected]]
Sent: Monday, July 25, 2016 10:17 PM
To: [email protected]
Subject: RE: Is Tika (especially CharsetDetector) considered thread-safe?
With 1.13 and this code, I'm not able to see any problems with our handful of
test files in our unit tests.
Exactly what code are you using? How are you doing detection?