[1/4] tika git commit: TIKA-1332 -- add English/Spanish common tokens, fix logging
Repository: tika Updated Branches: refs/heads/master a2d214c71 -> dc2dcd4cc http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/resources/log4j.properties -- diff --git a/tika-eval/src/main/resources/log4j.properties b/tika-eval/src/main/resources/log4j.properties new file mode 100644 index 000..925f9f2 --- /dev/null +++ b/tika-eval/src/main/resources/log4j.properties @@ -0,0 +1,11 @@ + +log4j.rootLogger=WARN,A1 + +#for debugging +#log4j.rootLogger=TRACE,A1 + +log4j.appender.A1=org.apache.log4j.ConsoleAppender + +# A1 uses PatternLayout. +log4j.appender.A1.layout=org.apache.log4j.PatternLayout +log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/resources/tika-eval-comparison-config.xml -- diff --git a/tika-eval/src/main/resources/tika-eval-comparison-config.xml b/tika-eval/src/main/resources/tika-eval-comparison-config.xml index 04ef658..88fdd0a 100644 --- a/tika-eval/src/main/resources/tika-eval-comparison-config.xml +++ b/tika-eval/src/main/resources/tika-eval-comparison-config.xml @@ -28,8 +28,6 @@ > - - @@ -72,7 +68,7 @@ crawlingInputDir="false" minJsonFileSizeBytes="-1" maxJsonFileSizeBytes="200" - commonTokens="resources/commontokens" + commonTokens="resources/common_tokens" /> http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/resources/tika-eval-profiler-config.xml -- diff --git a/tika-eval/src/main/resources/tika-eval-profiler-config.xml b/tika-eval/src/main/resources/tika-eval-profiler-config.xml index bd94b25..be7adf4 100644 --- a/tika-eval/src/main/resources/tika-eval-profiler-config.xml +++ b/tika-eval/src/main/resources/tika-eval-profiler-config.xml @@ -27,16 +27,13 @@ timeoutThresholdMillis="30"> - - - @@ -66,7 +63,7 @@ + commonTokens="resources/common_tokens"/> http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java -- diff --git 
a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java index 72e8008..6d4d4ef 100644 --- a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java +++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java @@ -59,7 +59,7 @@ public class SimpleComparerTest extends TikaTest { Paths.get("extractsA"), Paths.get("extractsB"), writer, -1, -1, ExtractReader.ALTER_METADATA_LIST.AS_IS); - AbstractProfiler.loadCommonTokens(this.getResourceAsFile("/commontokens").toPath()); + AbstractProfiler.loadCommonTokens(this.getResourceAsFile("/common_tokens").toPath()); LanguageIDWrapper.loadBuiltInModels(); } http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java -- diff --git a/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java b/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java index c358149..ff0961c 100644 --- a/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java +++ b/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java @@ -30,7 +30,7 @@ public class TikaEvalCLITest { public void testBasic() throws Exception { List args = new ArrayList<>(); args.add("Profile"); -args.add("-extractDir"); +args.add("-extracts"); args.add("tika"); args.add("-db"); args.add("mydb"); http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/test/resources/common_tokens/en -- diff --git a/tika-eval/src/test/resources/common_tokens/en b/tika-eval/src/test/resources/common_tokens/en new file mode 100644 index 000..8d442fe --- /dev/null +++ b/tika-eval/src/test/resources/common_tokens/en @@ -0,0 +1,8 @@ +the +of +and +a +or +#quick +brown +fox \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/test/resources/common_tokens/es -- diff --git a/tika-eval/src/test/resources/common_tokens/es 
b/tika-eval/src/test/resources/common_tokens/es new file mode 100644 index 000..b9bfd03 --- /dev/null +++
[2/4] tika git commit: TIKA-1332 -- add English/Spanish common tokens, fix logging
http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/resources/common_tokens/es -- diff --git a/tika-eval/src/main/resources/common_tokens/es b/tika-eval/src/main/resources/common_tokens/es new file mode 100644 index 000..2889e7c --- /dev/null +++ b/tika-eval/src/main/resources/common_tokens/es @@ -0,0 +1,19997 @@ +como +para +tambien +esta +entre +este +desde +anos +hasta +parte +donde +sobre +___url___ +durante +pero +historia +tiene +vease +primera +nombre +ciudad +despues +cuando +otros +gran +encuentra +cual +familia +primer +estado +mismo +estados +solo +siglo +tres +otras +unidos +segun +ademas +habia +forma +puede +fueron +nacional +bajo +espana +aunque +poblacion +lugar +junto +sido +mayor +tras +todo +oficial +ingles +tiempo +siendo +nueva +todos +estaba +general +antes +vida +tenia +nuevo +hacia +grupo +tanto +segunda +quien +cada +algunos +mientras +personas +embargo +varios +norte +espanol +julio +habitantes +debido +total +estos +provincia +luego +estadounidense +tuvo +bibliografia +eran +mundo +contra +otro +centro +region +cuatro +mayo +principal +entonces +pais +conocido +universidad +trabajo +cuenta +unos +septiembre +enero +octubre +guerra +marzo +serie +segundo +junio +diciembre +final +otra +dentro +agosto +especie +abril +paso +noviembre +ellos +internacional +tierra +actualmente +numero +varias +esto +algunas +hace +juan +poco +hizo +carrera +estas +traves +estan +febrero +jose +biografia +muchos +partir +medio +demografia +estudio +grandes +mejor +agua +misma +sitio +largo +caso +equipo +tarde +siguiente +obra +tipo +gobierno +publico +todas +bien +ultimo +llamado +posteriormente +primeros +geografia +mundial +pueden +casa +cualquier +sistema +maria +zona +importante +momento +desarrollo +real +frente +epoca +hijo +menos +comenzo +hecho +actual +ella +toda +obras +muerte +diferentes +termino +departamento +orden +central +partido +titulo +municipio +veces +cerca +argentina +casi +base +principales +origen +lista 
+cuales +dias +tienen +francia +periodo +edad +siguientes +presidente +padre +mayoria +manera +finalmente +pagina +distrito +estudios +cinco +localidad +hacer +punto +alrededor +superficie +haber +capital +club +carlos +nivel +censo +santa +pueblo +mexico +poder +espanola +condado +finales +ante +ubicado +nacio +distribucion +cambio +america +republica +ambos +fuera +conocida +tener +genero +television +director +llego +puesto +mediante +ejemplo +musica +caracteristicas +ultima +luis +pesar +europa +siempre +estuvo +lado +llamada +escuela +miembro +cargo +paises +importantes +inicio +area +linea +primero +muchas +alto +reino +futbol +mucho +edicion +incluso +politica +publicado +propio +produccion +oficina +francisco +oeste +popular +convirtio +notas +division +media +antonio +encuentran +version +temporada +cabo +premio +miembros +madrid +iglesia +original +union +frances +francesa +exito +superior +construccion +unico +principalmente +antiguo +recibio +porque +pelicula +libro +alli +informacion +direccion +igual +densidad +grande +sino +debe +campo +hombres +banda +compuesto +acuerdo +seria +politico +anterior +relacion +proyecto +decada +ello +seis +alta +menor +estilo +metros +principios +cuerpo +resto +posicion +meses +realizo +incluyendo +datos +ahora +cultura +mujeres +categoria +situada +servicio +york +programa +sede +militar +especial +comunidad +perteneciente +especialmente +unas +antigua +profesional +sociedad +interior +ellas +habian +nacido +coordenadas +alemania +instituto +obtuvo +pedro +natural +descripcion +local +trata +proceso +social +conjunto +organizacion +costa +pequeno +incluye +gracias +vista +album +actualidad +compania +aparece +estaban +participo +propia +estar +tercera +honor +formacion +ultimos +comun +color +joven +llegar +personal +italia +nombrado +revista +nunca +diversos +papel +madre +arte +pacifico +hombre +john +dicho +unica +pues +tercer +grupos +decir +manuel +especies +creacion +isla +situado +referencia +cuyo +primeras 
+fecha +posible +modo +interes +hijos +lugares +derecho +unido +canciones +pequena +buenos +cine +situacion +mujer +poblacional +cancion +muestra +gano +medios +territorio +ubicada +resultado +movimiento +millones +problemas +llevo +actividad +paris +blanco +martin +miguel +hija +publica +servicios +chile +nuevos +altura +civil +existen +algo +autor +principio +blancos +empresa +plantas +hermano +ejercito +diversas +ocasiones +aproximadamente +libre +partidos +dado +musical +campeonato +regreso +trayectoria +puntos +partes +apoyo +baja +cantidad +lleva +tenian +dicha +aleman +nacionales +causa +victoria +existe +nuevas +posee +isbn +teatro +control +geografica +personajes +premios +ciudades +cuya +asociacion +murio +juegos +similar +clubes +santiago +participacion +presencia +seleccion +presenta +decidio +museo +volvio +siete +liga +fundacion +logro +actividades +equipos +raza +espacio +fuerza +objetivo +cultural +fuerte
[3/4] tika git commit: TIKA-1332 -- add English/Spanish common tokens, fix logging
http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/resources/common_tokens/en -- diff --git a/tika-eval/src/main/resources/common_tokens/en b/tika-eval/src/main/resources/common_tokens/en new file mode 100644 index 000..7426945 --- /dev/null +++ b/tika-eval/src/main/resources/common_tokens/en @@ -0,0 +1,2 @@ +with +from +which +that +also +this +were +first +other +after +been +have +when +their +more +there +into +time +over +they +during +years +most +known +only +some +made +including +___url___ +between +under +where +about +part +later +many +three +history +such +used +then +than +united +well +while +both +being +early +states +through +year +american +became +them +these +name +called +however +before +since +would +several +until +world +second +people +following +same +high +city +area +born +four +number +life +national +family +based +north +state +named +south +those +although +because +another +work +long +like +around +each +according +former +place +along +major +line +john +still +general +large +group +small +began +school +found +will +within +located +much +west +include +often +back +very +five +last +could +present +home +against +march +main +together +june +public +series +january +october +september +great +even +july +among +every +took +included +left +late +april +best +what +just +east +system +century +down +order +times +original +august +near +december +white +member +november +become +local +house +university +total +held +third +children +given +different +government +make +various +having +death +land +international +without +british +population +received +company +though +died +using +members +english +again +married +february +county +town +built +single +considered +point +created +came +service +served +popular +next +take +established +period +similar +once +others +originally +short +central +said +york +full +career +side +making +further +published +living +released +moved +common +development 
+water +never +modern +important +show +power +below +book +went +result +country +support +little +music +less +example +role +continued +played +produced +written +western +addition +upon +film +days +final +developed +river +size +term +throughout +president +post +formed +right +black +special +started +half +current +either +community +founded +young +eventually +instead +usually +good +center +office +seen +thus +taken +control +lost +sometimes +works +rather +open +william +free +father +returned +french +average +almost +college +does +political +level +education +district +notable +james +live +older +followed +george +england +version +wife +television +across +king +party +position +despite +female +northern +production +working +described +himself +building +southern +case +seven +america +play +available +includes +close +million +team +wrote +today +list +largest +areas +should +record +whose +above +region +park +worked +generally +appeared +especially +itself +middle +away +london +native +german +leading +remained +months +joined +square +least +personal +events +military +gave +outside +road +return +beginning +david +currently +refer +site +units +robert +business +lead +alone +parts +possible +provided +station +field +soon +official +class +church +opened +europe +force +features +union +able +change +army +must +related +european +eastern +royal +replaced +story +help +race +individuals +association +brought +families +means +street +come +independent +range +summer +involved +society +designed +eight +character +referred +changed +announced +light +award +significant +ever +kingdom +council +famous +earlier +night +charles +island +private +previously +future +introduced +process +services +type +added +geography +recorded +director +project +provide +successful +france +uses +lower +spread +sold +information +husband +court +previous +social +language +canada +program +limited +village +human +african +fact +particularly +hall +game +taking 
+interest +design +culture +action +chief +research +completed +census +real +prior +numerous +band +paul +civil +lived +media +song +allowed +recent +additional +department +season +construction +whom +success +higher +radio +longer +complete +records +featured +initially +primary +housing +places +certain +club +terms +battle +groups +thomas +behind +hand +rock +better +board +already +traditional +finally +particular +decided +release +placed +associated +required +entire +always +notes +standard +give +forces +science +someone +mother +front +sent +mostly +past +fire +here +approximately +rest +elected +active +star +performance +remains +natural +income +wide +shows +space +location +professional +start +thought +fourth +michael +density +study +larger +california +performed +germany +richard +cross +playing +caused +daughter +turn +leader +division +word +brother +nearly +love +commercial +appointed +player
[4/4] tika git commit: TIKA-1332 -- add English/Spanish common tokens, fix logging
TIKA-1332 -- add English/Spanish common tokens, fix logging Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/dc2dcd4c Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/dc2dcd4c Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/dc2dcd4c Branch: refs/heads/master Commit: dc2dcd4ccc7bca640bb362f72729d0b6ba22a890 Parents: a2d214c Author: tballison Authored: Thu Feb 16 20:13:07 2017 -0500 Committer: tballison Committed: Thu Feb 16 20:13:07 2017 -0500 -- .../org/apache/tika/eval/AbstractProfiler.java |17 +- .../org/apache/tika/eval/ExtractProfiler.java |24 +- .../java/org/apache/tika/eval/TikaEvalCLI.java |22 +- .../tika/eval/batch/EvalConsumersBuilder.java | 2 +- .../eval/batch/SingleFileConsumerBuilder.java |18 +- .../eval/tokens/CommonTokenCountManager.java|75 +- tika-eval/src/main/resources/common_tokens/en | 2 + tika-eval/src/main/resources/common_tokens/es | 19997 tika-eval/src/main/resources/log4j.properties |11 + .../resources/tika-eval-comparison-config.xml | 6 +- .../resources/tika-eval-profiler-config.xml | 7 +- .../apache/tika/eval/SimpleComparerTest.java| 2 +- .../org/apache/tika/eval/TikaEvalCLITest.java | 2 +- tika-eval/src/test/resources/common_tokens/en | 8 + tika-eval/src/test/resources/common_tokens/es |10 + .../src/test/resources/common_tokens/zh-cn | 8 + .../src/test/resources/common_tokens/zh-tw | 8 + tika-eval/src/test/resources/commontokens/en| 8 - tika-eval/src/test/resources/commontokens/es|10 - tika-eval/src/test/resources/commontokens/zh-cn | 8 - tika-eval/src/test/resources/commontokens/zh-tw | 8 - tika-eval/src/test/resources/log4j.properties |11 - .../src/test/resources/log4j_process.properties |11 - ...ingle-file-profiler-crawl-extract-config.xml | 4 +- .../single-file-profiler-crawl-input-config.xml | 4 +- 25 files changed, 40143 insertions(+), 138 deletions(-) -- 
http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java -- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java index 24f7358..daa964a 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java @@ -158,6 +158,11 @@ public abstract class AbstractProfiler extends FileResourceConsumer { final LanguageIDWrapper langIder; protected IDBWriter writer; +/** + * + * @param p path to the common_tokens directory. If this is null, try to load from classPath + * @throws IOException + */ public static void loadCommonTokens(Path p) throws IOException { commonTokenCountManager = new CommonTokenCountManager(p); } @@ -536,29 +541,29 @@ public abstract class AbstractProfiler extends FileResourceConsumer { /** * * @param metadata - * @param extractDir + * @param extracts * @return evalfilepaths for files if crawling an extract directory */ protected EvalFilePaths getPathsFromExtractCrawl(Metadata metadata, - Path extractDir) { + Path extracts) { String relExtractFilePath = metadata.get(FSProperties.FS_REL_PATH); Matcher m = FILE_NAME_CLEANER.matcher(relExtractFilePath); Path relativeSourceFilePath = Paths.get(m.replaceAll("")); //just try slapping the relextractfilepath on the extractdir -Path extractFile = extractDir.resolve(relExtractFilePath); +Path extractFile = extracts.resolve(relExtractFilePath); if (! Files.isRegularFile(extractFile)) { //if that doesn't work, try to find the right extract file. 
//This is necessary if crawling extractsA and trying to find a file in //extractsB that is not in the same format: json vs txt or compressed -extractFile = findFile(extractDir, relativeSourceFilePath); +extractFile = findFile(extracts, relativeSourceFilePath); } return new EvalFilePaths(relativeSourceFilePath, extractFile); } //call this if the crawler is crawling through the src directory protected EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path srcDir, - Path extractDir) { + Path extracts) {
[1/4] tika git commit: TIKA-1332 -- add English Spanish common tokens; fix logging
Repository: tika Updated Branches: refs/heads/2.x 61532258f -> 81150859b http://git-wip-us.apache.org/repos/asf/tika/blob/81150859/tika-eval/src/main/resources/log4j.properties -- diff --git a/tika-eval/src/main/resources/log4j.properties b/tika-eval/src/main/resources/log4j.properties new file mode 100644 index 000..925f9f2 --- /dev/null +++ b/tika-eval/src/main/resources/log4j.properties @@ -0,0 +1,11 @@ + +log4j.rootLogger=WARN,A1 + +#for debugging +#log4j.rootLogger=TRACE,A1 + +log4j.appender.A1=org.apache.log4j.ConsoleAppender + +# A1 uses PatternLayout. +log4j.appender.A1.layout=org.apache.log4j.PatternLayout +log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n http://git-wip-us.apache.org/repos/asf/tika/blob/81150859/tika-eval/src/main/resources/tika-eval-comparison-config.xml -- diff --git a/tika-eval/src/main/resources/tika-eval-comparison-config.xml b/tika-eval/src/main/resources/tika-eval-comparison-config.xml index 2c51616..8070672 100644 --- a/tika-eval/src/main/resources/tika-eval-comparison-config.xml +++ b/tika-eval/src/main/resources/tika-eval-comparison-config.xml @@ -28,8 +28,6 @@ > - - @@ -72,7 +68,7 @@ crawlingInputDir="false" minJsonFileSizeBytes="-1" maxJsonFileSizeBytes="200" - commonTokens="resources/commontokens" + commonTokens="resources/common_tokens" /> http://git-wip-us.apache.org/repos/asf/tika/blob/81150859/tika-eval/src/main/resources/tika-eval-profiler-config.xml -- diff --git a/tika-eval/src/main/resources/tika-eval-profiler-config.xml b/tika-eval/src/main/resources/tika-eval-profiler-config.xml index bd94b25..be7adf4 100644 --- a/tika-eval/src/main/resources/tika-eval-profiler-config.xml +++ b/tika-eval/src/main/resources/tika-eval-profiler-config.xml @@ -27,16 +27,13 @@ timeoutThresholdMillis="30"> - - - @@ -66,7 +63,7 @@ + commonTokens="resources/common_tokens"/> http://git-wip-us.apache.org/repos/asf/tika/blob/81150859/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java -- diff --git 
a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java index 72e8008..6d4d4ef 100644 --- a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java +++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java @@ -59,7 +59,7 @@ public class SimpleComparerTest extends TikaTest { Paths.get("extractsA"), Paths.get("extractsB"), writer, -1, -1, ExtractReader.ALTER_METADATA_LIST.AS_IS); - AbstractProfiler.loadCommonTokens(this.getResourceAsFile("/commontokens").toPath()); + AbstractProfiler.loadCommonTokens(this.getResourceAsFile("/common_tokens").toPath()); LanguageIDWrapper.loadBuiltInModels(); } http://git-wip-us.apache.org/repos/asf/tika/blob/81150859/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java -- diff --git a/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java b/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java index c358149..ff0961c 100644 --- a/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java +++ b/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java @@ -30,7 +30,7 @@ public class TikaEvalCLITest { public void testBasic() throws Exception { List args = new ArrayList<>(); args.add("Profile"); -args.add("-extractDir"); +args.add("-extracts"); args.add("tika"); args.add("-db"); args.add("mydb"); http://git-wip-us.apache.org/repos/asf/tika/blob/81150859/tika-eval/src/test/resources/common_tokens/en -- diff --git a/tika-eval/src/test/resources/common_tokens/en b/tika-eval/src/test/resources/common_tokens/en new file mode 100644 index 000..8d442fe --- /dev/null +++ b/tika-eval/src/test/resources/common_tokens/en @@ -0,0 +1,8 @@ +the +of +and +a +or +#quick +brown +fox \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/81150859/tika-eval/src/test/resources/common_tokens/es -- diff --git a/tika-eval/src/test/resources/common_tokens/es 
b/tika-eval/src/test/resources/common_tokens/es new file mode 100644 index 000..b9bfd03 --- /dev/null +++
[3/4] tika git commit: TIKA-1332 -- add English Spanish common tokens; fix logging
http://git-wip-us.apache.org/repos/asf/tika/blob/81150859/tika-eval/src/main/resources/common_tokens/en -- diff --git a/tika-eval/src/main/resources/common_tokens/en b/tika-eval/src/main/resources/common_tokens/en new file mode 100644 index 000..7426945 --- /dev/null +++ b/tika-eval/src/main/resources/common_tokens/en @@ -0,0 +1,2 @@ +with +from +which +that +also +this +were +first +other +after +been +have +when +their +more +there +into +time +over +they +during +years +most +known +only +some +made +including +___url___ +between +under +where +about +part +later +many +three +history +such +used +then +than +united +well +while +both +being +early +states +through +year +american +became +them +these +name +called +however +before +since +would +several +until +world +second +people +following +same +high +city +area +born +four +number +life +national +family +based +north +state +named +south +those +although +because +another +work +long +like +around +each +according +former +place +along +major +line +john +still +general +large +group +small +began +school +found +will +within +located +much +west +include +often +back +very +five +last +could +present +home +against +march +main +together +june +public +series +january +october +september +great +even +july +among +every +took +included +left +late +april +best +what +just +east +system +century +down +order +times +original +august +near +december +white +member +november +become +local +house +university +total +held +third +children +given +different +government +make +various +having +death +land +international +without +british +population +received +company +though +died +using +members +english +again +married +february +county +town +built +single +considered +point +created +came +service +served +popular +next +take +established +period +similar +once +others +originally +short +central +said +york +full +career +side +making +further +published +living +released +moved +common +development 
+water +never +modern +important +show +power +below +book +went +result +country +support +little +music +less +example +role +continued +played +produced +written +western +addition +upon +film +days +final +developed +river +size +term +throughout +president +post +formed +right +black +special +started +half +current +either +community +founded +young +eventually +instead +usually +good +center +office +seen +thus +taken +control +lost +sometimes +works +rather +open +william +free +father +returned +french +average +almost +college +does +political +level +education +district +notable +james +live +older +followed +george +england +version +wife +television +across +king +party +position +despite +female +northern +production +working +described +himself +building +southern +case +seven +america +play +available +includes +close +million +team +wrote +today +list +largest +areas +should +record +whose +above +region +park +worked +generally +appeared +especially +itself +middle +away +london +native +german +leading +remained +months +joined +square +least +personal +events +military +gave +outside +road +return +beginning +david +currently +refer +site +units +robert +business +lead +alone +parts +possible +provided +station +field +soon +official +class +church +opened +europe +force +features +union +able +change +army +must +related +european +eastern +royal +replaced +story +help +race +individuals +association +brought +families +means +street +come +independent +range +summer +involved +society +designed +eight +character +referred +changed +announced +light +award +significant +ever +kingdom +council +famous +earlier +night +charles +island +private +previously +future +introduced +process +services +type +added +geography +recorded +director +project +provide +successful +france +uses +lower +spread +sold +information +husband +court +previous +social +language +canada +program +limited +village +human +african +fact +particularly +hall +game +taking 
+interest +design +culture +action +chief +research +completed +census +real +prior +numerous +band +paul +civil +lived +media +song +allowed +recent +additional +department +season +construction +whom +success +higher +radio +longer +complete +records +featured +initially +primary +housing +places +certain +club +terms +battle +groups +thomas +behind +hand +rock +better +board +already +traditional +finally +particular +decided +release +placed +associated +required +entire +always +notes +standard +give +forces +science +someone +mother +front +sent +mostly +past +fire +here +approximately +rest +elected +active +star +performance +remains +natural +income +wide +shows +space +location +professional +start +thought +fourth +michael +density +study +larger +california +performed +germany +richard +cross +playing +caused +daughter +turn +leader +division +word +brother +nearly +love +commercial +appointed +player
[4/4] tika git commit: TIKA-1332 -- add English Spanish common tokens; fix logging
TIKA-1332 -- add English Spanish common tokens; fix logging Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/81150859 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/81150859 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/81150859 Branch: refs/heads/2.x Commit: 81150859bdb25fe7faec575f5b916c8efad963cb Parents: 6153225 Author: tballison Authored: Thu Feb 16 20:12:01 2017 -0500 Committer: tballison Committed: Thu Feb 16 20:12:01 2017 -0500 -- .../org/apache/tika/eval/AbstractProfiler.java |17 +- .../org/apache/tika/eval/ExtractProfiler.java |24 +- .../java/org/apache/tika/eval/TikaEvalCLI.java |22 +- .../tika/eval/batch/EvalConsumersBuilder.java | 2 +- .../eval/batch/SingleFileConsumerBuilder.java |18 +- .../eval/tokens/CommonTokenCountManager.java|75 +- tika-eval/src/main/resources/common_tokens/en | 2 + tika-eval/src/main/resources/common_tokens/es | 19997 tika-eval/src/main/resources/log4j.properties |11 + .../resources/tika-eval-comparison-config.xml | 6 +- .../resources/tika-eval-profiler-config.xml | 7 +- .../apache/tika/eval/SimpleComparerTest.java| 2 +- .../org/apache/tika/eval/TikaEvalCLITest.java | 2 +- tika-eval/src/test/resources/common_tokens/en | 8 + tika-eval/src/test/resources/common_tokens/es |10 + .../src/test/resources/common_tokens/zh-cn | 8 + .../src/test/resources/common_tokens/zh-tw | 8 + tika-eval/src/test/resources/commontokens/en| 8 - tika-eval/src/test/resources/commontokens/es|10 - tika-eval/src/test/resources/commontokens/zh-cn | 8 - tika-eval/src/test/resources/commontokens/zh-tw | 8 - tika-eval/src/test/resources/log4j.properties |11 - .../src/test/resources/log4j_process.properties |11 - ...ingle-file-profiler-crawl-extract-config.xml | 4 +- .../single-file-profiler-crawl-input-config.xml | 4 +- 25 files changed, 40143 insertions(+), 138 deletions(-) -- 
http://git-wip-us.apache.org/repos/asf/tika/blob/81150859/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java -- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java index 24f7358..daa964a 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java @@ -158,6 +158,11 @@ public abstract class AbstractProfiler extends FileResourceConsumer { final LanguageIDWrapper langIder; protected IDBWriter writer; +/** + * + * @param p path to the common_tokens directory. If this is null, try to load from classPath + * @throws IOException + */ public static void loadCommonTokens(Path p) throws IOException { commonTokenCountManager = new CommonTokenCountManager(p); } @@ -536,29 +541,29 @@ public abstract class AbstractProfiler extends FileResourceConsumer { /** * * @param metadata - * @param extractDir + * @param extracts * @return evalfilepaths for files if crawling an extract directory */ protected EvalFilePaths getPathsFromExtractCrawl(Metadata metadata, - Path extractDir) { + Path extracts) { String relExtractFilePath = metadata.get(FSProperties.FS_REL_PATH); Matcher m = FILE_NAME_CLEANER.matcher(relExtractFilePath); Path relativeSourceFilePath = Paths.get(m.replaceAll("")); //just try slapping the relextractfilepath on the extractdir -Path extractFile = extractDir.resolve(relExtractFilePath); +Path extractFile = extracts.resolve(relExtractFilePath); if (! Files.isRegularFile(extractFile)) { //if that doesn't work, try to find the right extract file. 
//This is necessary if crawling extractsA and trying to find a file in //extractsB that is not in the same format: json vs txt or compressed -extractFile = findFile(extractDir, relativeSourceFilePath); +extractFile = findFile(extracts, relativeSourceFilePath); } return new EvalFilePaths(relativeSourceFilePath, extractFile); } //call this if the crawler is crawling through the src directory protected EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path srcDir, - Path extractDir) { + Path extracts) {
[2/4] tika git commit: TIKA-1332 -- add English Spanish common tokens; fix logging
http://git-wip-us.apache.org/repos/asf/tika/blob/81150859/tika-eval/src/main/resources/common_tokens/es -- diff --git a/tika-eval/src/main/resources/common_tokens/es b/tika-eval/src/main/resources/common_tokens/es new file mode 100644 index 000..2889e7c --- /dev/null +++ b/tika-eval/src/main/resources/common_tokens/es @@ -0,0 +1,19997 @@ +como +para +tambien +esta +entre +este +desde +anos +hasta +parte +donde +sobre +___url___ +durante +pero +historia +tiene +vease +primera +nombre +ciudad +despues +cuando +otros +gran +encuentra +cual +familia +primer +estado +mismo +estados +solo +siglo +tres +otras +unidos +segun +ademas +habia +forma +puede +fueron +nacional +bajo +espana +aunque +poblacion +lugar +junto +sido +mayor +tras +todo +oficial +ingles +tiempo +siendo +nueva +todos +estaba +general +antes +vida +tenia +nuevo +hacia +grupo +tanto +segunda +quien +cada +algunos +mientras +personas +embargo +varios +norte +espanol +julio +habitantes +debido +total +estos +provincia +luego +estadounidense +tuvo +bibliografia +eran +mundo +contra +otro +centro +region +cuatro +mayo +principal +entonces +pais +conocido +universidad +trabajo +cuenta +unos +septiembre +enero +octubre +guerra +marzo +serie +segundo +junio +diciembre +final +otra +dentro +agosto +especie +abril +paso +noviembre +ellos +internacional +tierra +actualmente +numero +varias +esto +algunas +hace +juan +poco +hizo +carrera +estas +traves +estan +febrero +jose +biografia +muchos +partir +medio +demografia +estudio +grandes +mejor +agua +misma +sitio +largo +caso +equipo +tarde +siguiente +obra +tipo +gobierno +publico +todas +bien +ultimo +llamado +posteriormente +primeros +geografia +mundial +pueden +casa +cualquier +sistema +maria +zona +importante +momento +desarrollo +real +frente +epoca +hijo +menos +comenzo +hecho +actual +ella +toda +obras +muerte +diferentes +termino +departamento +orden +central +partido +titulo +municipio +veces +cerca +argentina +casi +base +principales +origen +lista 
+cuales +dias +tienen +francia +periodo +edad +siguientes +presidente +padre +mayoria +manera +finalmente +pagina +distrito +estudios +cinco +localidad +hacer +punto +alrededor +superficie +haber +capital +club +carlos +nivel +censo +santa +pueblo +mexico +poder +espanola +condado +finales +ante +ubicado +nacio +distribucion +cambio +america +republica +ambos +fuera +conocida +tener +genero +television +director +llego +puesto +mediante +ejemplo +musica +caracteristicas +ultima +luis +pesar +europa +siempre +estuvo +lado +llamada +escuela +miembro +cargo +paises +importantes +inicio +area +linea +primero +muchas +alto +reino +futbol +mucho +edicion +incluso +politica +publicado +propio +produccion +oficina +francisco +oeste +popular +convirtio +notas +division +media +antonio +encuentran +version +temporada +cabo +premio +miembros +madrid +iglesia +original +union +frances +francesa +exito +superior +construccion +unico +principalmente +antiguo +recibio +porque +pelicula +libro +alli +informacion +direccion +igual +densidad +grande +sino +debe +campo +hombres +banda +compuesto +acuerdo +seria +politico +anterior +relacion +proyecto +decada +ello +seis +alta +menor +estilo +metros +principios +cuerpo +resto +posicion +meses +realizo +incluyendo +datos +ahora +cultura +mujeres +categoria +situada +servicio +york +programa +sede +militar +especial +comunidad +perteneciente +especialmente +unas +antigua +profesional +sociedad +interior +ellas +habian +nacido +coordenadas +alemania +instituto +obtuvo +pedro +natural +descripcion +local +trata +proceso +social +conjunto +organizacion +costa +pequeno +incluye +gracias +vista +album +actualidad +compania +aparece +estaban +participo +propia +estar +tercera +honor +formacion +ultimos +comun +color +joven +llegar +personal +italia +nombrado +revista +nunca +diversos +papel +madre +arte +pacifico +hombre +john +dicho +unica +pues +tercer +grupos +decir +manuel +especies +creacion +isla +situado +referencia +cuyo +primeras 
+fecha +posible +modo +interes +hijos +lugares +derecho +unido +canciones +pequena +buenos +cine +situacion +mujer +poblacional +cancion +muestra +gano +medios +territorio +ubicada +resultado +movimiento +millones +problemas +llevo +actividad +paris +blanco +martin +miguel +hija +publica +servicios +chile +nuevos +altura +civil +existen +algo +autor +principio +blancos +empresa +plantas +hermano +ejercito +diversas +ocasiones +aproximadamente +libre +partidos +dado +musical +campeonato +regreso +trayectoria +puntos +partes +apoyo +baja +cantidad +lleva +tenian +dicha +aleman +nacionales +causa +victoria +existe +nuevas +posee +isbn +teatro +control +geografica +personajes +premios +ciudades +cuya +asociacion +murio +juegos +similar +clubes +santiago +participacion +presencia +seleccion +presenta +decidio +museo +volvio +siete +liga +fundacion +logro +actividades +equipos +raza +espacio +fuerza +objetivo +cultural +fuerte
tika git commit: TIKA-1332 -- fix analyzer chain for common tokens, clean up UTF-8 references
Repository: tika Updated Branches: refs/heads/master 6c6b77b41 -> a2d214c71 TIKA-1332 -- fix analyzer chain for common tokens, clean up UTF-8 references Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/a2d214c7 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/a2d214c7 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/a2d214c7 Branch: refs/heads/master Commit: a2d214c71602f4f5a84635adc38c43182a39a390 Parents: 6c6b77b Author: tballisonAuthored: Thu Feb 16 15:41:53 2017 -0500 Committer: tballison Committed: Thu Feb 16 15:41:53 2017 -0500 -- .../org/apache/tika/eval/io/ExtractReader.java | 3 +- .../tika/eval/tokens/AnalyzerManager.java | 3 +- .../apache/tika/eval/tokens/TokenIntPair.java | 4 +-- .../src/main/resources/lucene-analyzers.json| 32 +--- 4 files changed, 33 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/tika/blob/a2d214c7/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java -- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java index cd90f76..2631f44 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java @@ -5,6 +5,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -78,7 +79,7 @@ public class ExtractReader { LOGGER.warn("Can't yet process compression of type: "+fileSuffixes.compression); } } -reader = new BufferedReader(new InputStreamReader(is, "UTF-8")); +reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)); if (fileSuffixes.txtOrJson.equals("json")) { metadataList = JsonMetadataList.fromJson(reader); 
http://git-wip-us.apache.org/repos/asf/tika/blob/a2d214c7/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java -- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java index db6ae26..774b19a 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; +import java.nio.charset.StandardCharsets; import java.util.Map; import com.google.gson.Gson; @@ -47,7 +48,7 @@ public class AnalyzerManager { public static AnalyzerManager newInstance() throws IOException { InputStream is = AnalyzerManager.class.getClassLoader().getResourceAsStream("lucene-analyzers.json"); -Reader reader = new InputStreamReader(is, "UTF-8"); +Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8); GsonBuilder builder = new GsonBuilder(); builder.registerTypeHierarchyAdapter(Map.class, new AnalyzerDeserializer()); Gson gson = builder.create(); http://git-wip-us.apache.org/repos/asf/tika/blob/a2d214c7/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenIntPair.java -- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenIntPair.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenIntPair.java index 4b57d25..a924f07 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenIntPair.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenIntPair.java @@ -17,8 +17,6 @@ package org.apache.tika.eval.tokens; -import org.jetbrains.annotations.NotNull; - public class TokenIntPair implements Comparable { final String token; @@ -63,7 +61,7 @@ public class TokenIntPair implements Comparable { * @return comparison */ @Override -public int compareTo(@NotNull TokenIntPair o) { +public int 
compareTo(TokenIntPair o) { if (this.value > o.value) { return -1; } else if (this.value < o.value) { http://git-wip-us.apache.org/repos/asf/tika/blob/a2d214c7/tika-eval/src/main/resources/lucene-analyzers.json -- diff --git
tika git commit: TIKA-1332 3rd time's the charm. Fix dependencies with IOUtils.
Repository: tika Updated Branches: refs/heads/2.x 44612ae40 -> 61532258f TIKA-1332 3rd time's the charm. Fix dependencies with IOUtils. Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/61532258 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/61532258 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/61532258 Branch: refs/heads/2.x Commit: 61532258f2ff44787050f0f3a0bb8ba17d8e50b0 Parents: 44612ae Author: tballisonAuthored: Thu Feb 16 14:41:13 2017 -0500 Committer: tballison Committed: Thu Feb 16 14:41:13 2017 -0500 -- .../main/java/org/apache/tika/eval/XMLErrorLogUpdater.java| 2 +- tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java | 2 +- tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java | 2 +- .../src/main/java/org/apache/tika/eval/io/ExtractReader.java | 2 +- .../src/main/java/org/apache/tika/eval/io/XMLLogReader.java | 7 --- .../main/java/org/apache/tika/eval/tokens/TokenIntPair.java | 4 +--- 6 files changed, 9 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/tika/blob/61532258/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java -- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java b/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java index 9a7e7aa..eaaf228 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java @@ -32,6 +32,7 @@ import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; +import org.apache.commons.io.IOExceptionWithCause; import org.apache.log4j.Level; import org.apache.tika.eval.db.Cols; import org.apache.tika.eval.db.DBUtil; @@ -40,7 +41,6 @@ import org.apache.tika.eval.db.TableInfo; import org.apache.tika.eval.io.XMLLogMsgHandler; import org.apache.tika.eval.io.XMLLogReader; import org.apache.tika.eval.reports.ResultsReporter; 
-import org.apache.tika.io.IOExceptionWithCause; import org.slf4j.Logger; import org.slf4j.LoggerFactory; http://git-wip-us.apache.org/repos/asf/tika/blob/61532258/tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java -- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java b/tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java index 1efa48a..d99798d 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java @@ -31,8 +31,8 @@ import java.util.Locale; import java.util.Map; import java.util.Set; +import org.apache.commons.io.IOExceptionWithCause; import org.apache.log4j.Logger; -import org.apache.tika.io.IOExceptionWithCause; public abstract class DBUtil { http://git-wip-us.apache.org/repos/asf/tika/blob/61532258/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java -- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java b/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java index db4cd04..383f25c 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java @@ -25,6 +25,7 @@ import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicLong; +import org.apache.commons.io.IOExceptionWithCause; import org.apache.log4j.Logger; import org.apache.tika.config.TikaConfig; import org.apache.tika.eval.db.ColInfo; @@ -32,7 +33,6 @@ import org.apache.tika.eval.db.Cols; import org.apache.tika.eval.db.DBUtil; import org.apache.tika.eval.db.MimeBuffer; import org.apache.tika.eval.db.TableInfo; -import org.apache.tika.io.IOExceptionWithCause; /** * This is still in its early stages. 
The idea is to http://git-wip-us.apache.org/repos/asf/tika/blob/61532258/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java -- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java index cd90f76..f703408 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java @@ -15,9 +15,9 @@ import java.util.regex.Pattern; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; import
tika git commit: TIKA-1332 -- clean up commons-io version mgmt
Repository: tika Updated Branches: refs/heads/master d194ba402 -> 6c6b77b41 TIKA-1332 -- clean up commons-io version mgmt Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/6c6b77b4 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/6c6b77b4 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/6c6b77b4 Branch: refs/heads/master Commit: 6c6b77b4159d4e7bbebd883cb52f2160be9cc5a6 Parents: d194ba4 Author: tballison Authored: Thu Feb 16 13:39:26 2017 -0500 Committer: tballison Committed: Thu Feb 16 13:39:26 2017 -0500 -- tika-eval/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/tika/blob/6c6b77b4/tika-eval/pom.xml -- diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml index f1758cc..8bc7680 100644 --- a/tika-eval/pom.xml +++ b/tika-eval/pom.xml @@ -87,7 +87,7 @@ commons-io commons-io -2.4 +${commons.io.version}
tika git commit: TIKA-1332 fix pom for 2.0
Repository: tika Updated Branches: refs/heads/2.x 0d04b499a -> 44612ae40 TIKA-1332 fix pom for 2.0 Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/44612ae4 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/44612ae4 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/44612ae4 Branch: refs/heads/2.x Commit: 44612ae405d1342661387f74320e13c96301754b Parents: 0d04b49 Author: tballison Authored: Thu Feb 16 13:37:26 2017 -0500 Committer: tballison Committed: Thu Feb 16 13:37:26 2017 -0500 -- tika-eval/pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/tika/blob/44612ae4/tika-eval/pom.xml -- diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml index 9167742..ce85229 100644 --- a/tika-eval/pom.xml +++ b/tika-eval/pom.xml @@ -35,7 +35,7 @@ org.apache.tika tika-parent -1.15-SNAPSHOT +2.0-SNAPSHOT ../tika-parent/pom.xml @@ -87,7 +87,7 @@ commons-io commons-io -2.4 +${commons.io.version}
tika git commit: TIKA-1332 downgrade to Lucene 5.x so that this can run w/ Java 7
Repository: tika Updated Branches: refs/heads/2.x 69dd0328b -> 0d04b499a TIKA-1332 downgrade to Lucene 5.x so that this can run w/ Java 7 Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/0d04b499 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/0d04b499 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/0d04b499 Branch: refs/heads/2.x Commit: 0d04b499a6c305c6c0656f37abfd6f78440ea309 Parents: 69dd032 Author: tballisonAuthored: Thu Feb 16 12:59:28 2017 -0500 Committer: tballison Committed: Thu Feb 16 12:59:28 2017 -0500 -- tika-eval/pom.xml | 3 ++- .../org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java | 2 +- .../tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/tika/blob/0d04b499/tika-eval/pom.xml -- diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml index ee0940c..9167742 100644 --- a/tika-eval/pom.xml +++ b/tika-eval/pom.xml @@ -26,7 +26,8 @@ 4.0.0 1.3.1 -6.2.1 + +5.5.3 3.16-beta2 http://git-wip-us.apache.org/repos/asf/tika/blob/0d04b499/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java -- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java index fb72e84..2c046ad 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java @@ -19,9 +19,9 @@ package org.apache.tika.eval.tokens; import java.io.IOException; import java.util.Map; -import org.apache.lucene.analysis.FilteringTokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.util.FilteringTokenFilter; import 
org.apache.lucene.analysis.util.TokenFilterFactory; /** http://git-wip-us.apache.org/repos/asf/tika/blob/0d04b499/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java -- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java index 31fa866..549e85d 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java @@ -3,11 +3,11 @@ package org.apache.tika.eval.tokens; import java.io.IOException; import java.util.Map; -import org.apache.lucene.analysis.FilteringTokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.cjk.CJKBigramFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.analysis.util.FilteringTokenFilter; import org.apache.lucene.analysis.util.TokenFilterFactory; /**
tika git commit: TIKA-1332 -- downgrade Lucene to 5.x to allow for Java 7
Repository: tika Updated Branches: refs/heads/master 506b57256 -> d194ba402 TIKA-1332 -- downgrade Lucene to 5.x to allow for Java 7 Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d194ba40 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d194ba40 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d194ba40 Branch: refs/heads/master Commit: d194ba4022dffa61cad2a12ea0092f6ec00588d2 Parents: 506b572 Author: tballisonAuthored: Thu Feb 16 12:57:22 2017 -0500 Committer: tballison Committed: Thu Feb 16 12:57:22 2017 -0500 -- tika-eval/pom.xml | 3 ++- .../org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java | 2 +- .../tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/tika/blob/d194ba40/tika-eval/pom.xml -- diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml index ec2c18b..f1758cc 100644 --- a/tika-eval/pom.xml +++ b/tika-eval/pom.xml @@ -26,7 +26,8 @@ 4.0.0 1.3.1 -6.2.1 + +5.5.3 3.16-beta2 http://git-wip-us.apache.org/repos/asf/tika/blob/d194ba40/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java -- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java index fb72e84..2c046ad 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java @@ -19,9 +19,9 @@ package org.apache.tika.eval.tokens; import java.io.IOException; import java.util.Map; -import org.apache.lucene.analysis.FilteringTokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.util.FilteringTokenFilter; import 
org.apache.lucene.analysis.util.TokenFilterFactory; /** http://git-wip-us.apache.org/repos/asf/tika/blob/d194ba40/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java -- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java index 31fa866..549e85d 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java @@ -3,11 +3,11 @@ package org.apache.tika.eval.tokens; import java.io.IOException; import java.util.Map; -import org.apache.lucene.analysis.FilteringTokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.cjk.CJKBigramFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.analysis.util.FilteringTokenFilter; import org.apache.lucene.analysis.util.TokenFilterFactory; /**
tika git commit: TIKA-1332 -- fix one report for eval profiler and clean up whitespace
Repository: tika Updated Branches: refs/heads/master aa7a0c353 -> 506b57256 TIKA-1332 -- fix one report for eval profiler and clean up whitespace Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/506b5725 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/506b5725 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/506b5725 Branch: refs/heads/master Commit: 506b572560f6c7f44270b55877f110719a7d4b1f Parents: aa7a0c3 Author: tballisonAuthored: Thu Feb 16 12:33:58 2017 -0500 Committer: tballison Committed: Thu Feb 16 12:33:58 2017 -0500 -- .../src/main/resources/comparison-reports.xml | 2 +- .../src/main/resources/lucene-analyzers.json| 30 +++-- .../src/main/resources/profile-reports.xml | 11 ++-- .../resources/tika-eval-comparison-config.xml | 65 ++-- ...ingle-file-profiler-crawl-extract-config.xml | 2 +- .../single-file-profiler-crawl-input-config.xml | 2 +- 6 files changed, 52 insertions(+), 60 deletions(-) -- http://git-wip-us.apache.org/repos/asf/tika/blob/506b5725/tika-eval/src/main/resources/comparison-reports.xml -- diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml index cb7befd..d69cb2a 100644 --- a/tika-eval/src/main/resources/comparison-reports.xml +++ b/tika-eval/src/main/resources/comparison-reports.xml @@ -206,7 +206,7 @@ - http://git-wip-us.apache.org/repos/asf/tika/blob/506b5725/tika-eval/src/main/resources/lucene-analyzers.json -- diff --git a/tika-eval/src/main/resources/lucene-analyzers.json b/tika-eval/src/main/resources/lucene-analyzers.json index 268494f..f7141f7 100644 --- a/tika-eval/src/main/resources/lucene-analyzers.json +++ b/tika-eval/src/main/resources/lucene-analyzers.json @@ -1,12 +1,11 @@ { "analyzers": { -"general" : -{ +"general": { "charfilters": [ { "factory": "oala.charfilter.MappingCharFilterFactory", "params": { -"mapping" : "/lucene-char-mapping.txt" +"mapping": "/lucene-char-mapping.txt" 
} } ], @@ -22,20 +21,17 @@ { "factory": "oala.cjk.CJKBigramFilterFactory", "params": { -"outputUnigrams" : "false" +"outputUnigrams": "false" } } ] - }, - -"alpha" : -{ +"alpha": { "charfilters": [ { "factory": "oala.charfilter.MappingCharFilterFactory", "params": { -"mapping" : "/lucene-char-mapping.txt" +"mapping": "/lucene-char-mapping.txt" } } ], @@ -67,7 +63,7 @@ { "factory": "oala.cjk.CJKBigramFilterFactory", "params": { -"outputUnigrams" : "false" +"outputUnigrams": "false" } }, { @@ -75,33 +71,27 @@ "params": {} } ] - }, -"common_tokens" : -{ +"common_tokens": { "tokenizer": { "factory": "oala.standard.UAX29URLEmailTokenizerFactory", "params": {} }, - "tokenfilters": [ { "factory": "oala.cjk.CJKBigramFilterFactory", "params": { -"outputUnigrams" : "false" +"outputUnigrams": "false" } }, { "factory": "org.apache.tika.eval.tokens.CJKBigramAwareLengthFilterFactory", "params": { -"min" : 4, -"max" : 20 +"min": 4, +"max": 20 } } - ] - } - } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/506b5725/tika-eval/src/main/resources/profile-reports.xml -- diff --git a/tika-eval/src/main/resources/profile-reports.xml b/tika-eval/src/main/resources/profile-reports.xml index 2a94a97..1f9be6a 100644 --- a/tika-eval/src/main/resources/profile-reports.xml +++ b/tika-eval/src/main/resources/profile-reports.xml @@ -98,7 +98,6 @@ - -select LANG_ID_1 as DetectedLang, count(1) as cnt -from contents -group by LANG_ID_1 -order by cnt desc +select parse_exception_description, count(1) cnt +from parse_exceptions e +join profiles p on p.id = e.id +join ref_parse_exception_types et on et.parse_exception_type_id=e.parse_exception_type_id +group by parse_exception_description +order by cnt desc;
[5/6] tika git commit: TIKA-1332 initial commit of tika-eval. More work remains.
http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java -- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java new file mode 100644 index 000..5860327 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java @@ -0,0 +1,262 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.eval; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.cli.DefaultParser; +import org.apache.commons.cli.ParseException; +import org.apache.tika.batch.fs.FSBatchProcessCLI; +import org.apache.tika.eval.reports.ResultsReporter; +import org.h2.tools.Console; + +public class TikaEvalCLI { +static final String[] tools = {"Profile", "Compare", "Report", "StartDB"}; + +private static String specifyTools() { +StringBuilder sb = new StringBuilder(); +sb.append("Must specify one of the following tools in the first parameter:\n"); +for (String s : tools) { +sb.append(s+"\n"); +} +return sb.toString(); + +} + +private void execute(String[] args) throws Exception { +String tool = args[0]; +String[] subsetArgs = new String[args.length-1]; +System.arraycopy(args, 1, subsetArgs, 0, args.length - 1); +if (tool.equals("Report")) { +handleReport(subsetArgs); +} else if (tool.equals("Compare")) { +handleCompare(subsetArgs); +} else if (tool.equals("Profile")) { +handleProfile(subsetArgs); +} else if (tool.equals("StartDB")) { +handleStartDB(subsetArgs); +} else { +System.out.println(specifyTools()); +} +} + +private void handleStartDB(String[] args) throws SQLException { +List argList = new ArrayList<>(); +argList.add("-web"); +Console.main(argList.toArray(new String[argList.size()])); +while(true) { +try { +Thread.sleep(1000); +} catch (InterruptedException e){ +break; +} +} +} + +private void handleProfile(String[] subsetArgs) throws Exception { +List argList = new ArrayList(Arrays.asList(subsetArgs)); + +boolean containsBC = false; +String inputDir = null; +String extractDir = null; +String alterExtract = null; +//confirm there's a batch-config file +for (int i = 0; i < argList.size(); i++) { +String arg = argList.get(i); +if (arg.equals("-bc")) { +containsBC = 
true; +} else if (arg.equals("-inputDir")) { +if (i+1 >= argList.size()) { +System.err.println("Must specify directory after -inputDir"); +ExtractProfiler.USAGE(); +return; +} +inputDir = argList.get(i+1); +i++; +} else if (arg.equals("-extractDir")) { +if (i+1 >= argList.size()) { +System.err.println("Must specify directory after -extractDir"); +ExtractProfiler.USAGE(); +return; +} +extractDir = argList.get(i+1); +i++; +} else if (arg.equals("-alterExtract")) { +if (i+1 >= argList.size()) { +System.err.println("Must specify directory after -extractsB"); +ExtractComparer.USAGE(); +return; +} +alterExtract = argList.get(i+1); +i++; +} +} + +if (alterExtract != null && !alterExtract.equals("as_is") && +!alterExtract.equals("concatenate_content") && +!alterExtract.equals("first_only")) { +
[4/6] tika git commit: TIKA-1332 initial commit of tika-eval. More work remains.
http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java -- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java new file mode 100644 index 000..cd90f76 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java @@ -0,0 +1,161 @@ +package org.apache.tika.eval.io; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; +import org.apache.commons.compress.compressors.z.ZCompressorInputStream; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.IOUtils; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.serialization.JsonMetadataList; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.RecursiveParserWrapper; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class ExtractReader { + +public enum ALTER_METADATA_LIST { +AS_IS, //leave the metadata list as is +FIRST_ONLY, //take only the metadata list for the "container" document +CONCATENATE_CONTENT_INTO_FIRST // concatenate all of the content into the first +} +private final static Logger LOGGER = LoggerFactory.getLogger(ExtractReader.class); +TikaConfig tikaConfig = TikaConfig.getDefaultConfig(); + +public List loadExtract(Path thisFile, ALTER_METADATA_LIST alterExtractList) { +List metadataList = null; +if (thisFile == null || !Files.isRegularFile(thisFile)) { +return metadataList; +} +Reader reader = null; +InputStream is = null; +FileSuffixes fileSuffixes = parseSuffixes(thisFile.getFileName().toString()); +if (fileSuffixes.txtOrJson == null) { +LOGGER.warn("file must end with .txt or .json: "+thisFile.getFileName().toString()); +return metadataList; +} + +try { +is = Files.newInputStream(thisFile); +if (fileSuffixes.compression != null) { +if (fileSuffixes.compression.equals("bz2")) { +is = new BZip2CompressorInputStream(is); +} else if (fileSuffixes.compression.equals("gz")) { +is = new GzipCompressorInputStream(is); +} else if (fileSuffixes.compression.equals("zip")) { +is = new ZCompressorInputStream(is); +} else { +LOGGER.warn("Can't yet process compression of type: "+fileSuffixes.compression); +} +} +reader = new BufferedReader(new InputStreamReader(is, "UTF-8")); + +if (fileSuffixes.txtOrJson.equals("json")) { +metadataList = JsonMetadataList.fromJson(reader); +if (alterExtractList.equals(ALTER_METADATA_LIST.FIRST_ONLY) && 
metadataList.size() > 1) { +while (metadataList.size() > 1) { +metadataList.remove(metadataList.size()-1); +} +} else if (alterExtractList.equals(ALTER_METADATA_LIST.AS_IS.CONCATENATE_CONTENT_INTO_FIRST) && +metadataList.size() > 1) { +StringBuilder sb = new StringBuilder(); +Metadata containerMetadata = metadataList.get(0); +for (int i = 0; i < metadataList.size(); i++) { +Metadata m = metadataList.get(i); +String c = m.get(RecursiveParserWrapper.TIKA_CONTENT); +if (c
[1/6] tika git commit: TIKA-1332 initial commit of tika-eval. More work remains.
Repository: tika Updated Branches: refs/heads/2.x 6bfe5d565 -> 5e49c3308 http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json -- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json b/tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json new file mode 100644 index 000..6ef09de --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json @@ -0,0 +1,5 @@ +[{ + "Content-Type":"text/plain", + "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog 1,200 12", + "xmpTPg:NPages":2 +}] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/test-dirs/extractsA/file10_permahang.txt.json -- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file10_permahang.txt.json b/tika-eval/src/test/resources/test-dirs/extractsA/file10_permahang.txt.json new file mode 100644 index 000..e69de29 http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/test-dirs/extractsA/file11_oom.txt.json -- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file11_oom.txt.json b/tika-eval/src/test/resources/test-dirs/extractsA/file11_oom.txt.json new file mode 100644 index 000..e69de29 http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json -- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json b/tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json new file mode 100644 index 000..0e2558b --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json @@ -0,0 +1,4 @@ +[{ + "Content-Type":"text/plain", + "X-TIKA:content":"El zorro marrón rápido saltó sobre el perro. El zorro marrón rápido saltó sobre el perro. 
El zorro marrón rápido saltó sobre el perro" +}] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json -- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json b/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json new file mode 100644 index 000..5371c87 --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json @@ -0,0 +1,10 @@ +[{ + "Content-Type":"text/plain", + "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog" + }, + { +"Content-Type":"text/plain", +"X-TIKA:embedded_resource_path":"inner.txt", +"X-TIKA:content":"attachment contents" + } +] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json -- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json b/tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json new file mode 100644 index 000..5371c87 --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json @@ -0,0 +1,10 @@ +[{ + "Content-Type":"text/plain", + "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog" + }, + { +"Content-Type":"text/plain", +"X-TIKA:embedded_resource_path":"inner.txt", +"X-TIKA:content":"attachment contents" + } +] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json -- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json b/tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json new file mode 100644 index 000..18763d1 --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json @@ -0,0 +1,4 @@ 
+[{ + "Content-Type":"text/plain", + "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog" +}] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/test-dirs/extractsA/file4_emptyB.pdf.json -- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file4_emptyB.pdf.json b/tika-eval/src/test/resources/test-dirs/extractsA/file4_emptyB.pdf.json new file mode
[2/6] tika git commit: TIKA-1332 initial commit of tika-eval. More work remains.
http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java -- diff --git a/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java b/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java new file mode 100644 index 000..0d925cf --- /dev/null +++ b/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java @@ -0,0 +1,411 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.eval; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.nio.file.FileSystems; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.Connection; +import java.sql.ResultSet; +import java.sql.ResultSetMetaData; +import java.sql.Statement; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.io.FileUtils; +import org.apache.tika.batch.fs.FSBatchTestBase; +import org.apache.tika.eval.db.Cols; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.junit.Test; + +@Ignore("need to fix tika-batch tests to make this work") +public class ComparerBatchTest extends FSBatchTestBase { + +public final static String COMPARER_PROCESS_CLASS = "org.apache.tika.batch.fs.FSBatchProcessCLI"; + +private static Path dbDir; +private static Connection conn; + +private final static String compJoinCont = ""; +/*ExtractComparer.COMPARISONS_TABLE+" cmp " + +"join "+ExtractComparer.CONTAINERS_TABLE + " cnt "+ +"on cmp."+AbstractProfiler.CONTAINER_HEADERS.CONTAINER_ID+ +" = cnt."+AbstractProfiler.CONTAINER_HEADERS.CONTAINER_ID;*/ + +@BeforeClass +public static void setUp() throws Exception { + +File inputRoot = new File(ComparerBatchTest.class.getResource("/test-dirs").toURI()); +dbDir = Files.createTempDirectory(inputRoot.toPath(), "tika-test-db-dir-"); +Mapargs = new HashMap<>(); +Path db = FileSystems.getDefault().getPath(dbDir.toString(), "comparisons_test"); +args.put("-db", db.toString()); + +//for debugging, you can use this to select only one file pair to load +//args.put("-includeFilePat", "file8.*"); +/* +BatchProcessTestExecutor ex = new BatchProcessTestExecutor(COMPARER_PROCESS_CLASS, args, +"/tika-batch-comparison-eval-config.xml"); +StreamStrings streamStrings = ex.execute(); 
+System.out.println(streamStrings.getErrString()); +System.out.println(streamStrings.getOutString()); +H2Util dbUtil = new H2Util(db); +conn = dbUtil.getConnection();*/ +} + +@AfterClass +public static void tearDown() throws Exception { + +conn.close(); + +FileUtils.deleteDirectory(dbDir.toFile()); +} + + +@Test +public void testSimpleDBWriteAndRead() throws Exception { +Set set = new HashSet<>(); +//filenames +List list = getColStrings(Cols.FILE_NAME.name(), +ExtractComparer.PROFILES_A.getName(), ""); +assertEquals(7, list.size()); +assertTrue(list.contains("file1.pdf")); + +//container ids in comparisons table +list = getColStrings(Cols.CONTAINER_ID.name(), +ExtractComparer.COMPARISON_CONTAINERS.getName(),""); +assertEquals(10, list.size()); +set.clear(); set.addAll(list); +assertEquals(10, set.size()); +/* +//ids in comparisons table +list = getColStrings(AbstractProfiler.HEADERS.ID.name(), +compTable,""); +assertEquals(9, list.size()); +set.clear(); set.addAll(list); +assertEquals(9, set.size());*/ +} + + + +/* +@Test +public void testFile1PDFRow() throws Exception { +String where = fp+"='file1.pdf'"; +Map data = getRow(compJoinCont, where); +String result =
[6/6] tika git commit: TIKA-1332 initial commit of tika-eval. More work remains.
TIKA-1332 initial commit of tika-eval. More work remains. Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/5e49c330 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/5e49c330 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/5e49c330 Branch: refs/heads/2.x Commit: 5e49c33087bbf03763b05efda3bbb96d8cc20ea4 Parents: 6bfe5d5 Author: tballison Authored: Thu Feb 16 12:19:54 2017 -0500 Committer: tballison Committed: Thu Feb 16 12:19:54 2017 -0500 -- CHANGES.txt | 2 + LICENSE.txt | 8 + pom.xml | 1 + tika-eval/pom.xml | 281 +++ .../org/apache/tika/eval/AbstractProfiler.java | 693 .../org/apache/tika/eval/EvalFilePaths.java | 108 +++ .../org/apache/tika/eval/ExtractComparer.java | 455 +++ .../org/apache/tika/eval/ExtractProfiler.java | 238 ++ .../java/org/apache/tika/eval/TikaEvalCLI.java | 262 ++ .../apache/tika/eval/XMLErrorLogUpdater.java| 226 ++ .../tika/eval/batch/DBConsumersManager.java | 92 +++ .../tika/eval/batch/EvalConsumerBuilder.java| 134 .../tika/eval/batch/EvalConsumersBuilder.java | 133 .../tika/eval/batch/FileComparerBuilder.java| 122 +++ .../eval/batch/SingleFileConsumerBuilder.java | 108 +++ .../apache/tika/eval/db/AbstractDBBuffer.java | 77 ++ .../java/org/apache/tika/eval/db/ColInfo.java | 116 +++ .../main/java/org/apache/tika/eval/db/Cols.java | 90 +++ .../java/org/apache/tika/eval/db/DBBuffer.java | 54 ++ .../java/org/apache/tika/eval/db/DBUtil.java| 201 + .../java/org/apache/tika/eval/db/H2Util.java| 71 ++ .../org/apache/tika/eval/db/MimeBuffer.java | 144 .../java/org/apache/tika/eval/db/TableInfo.java | 64 ++ .../java/org/apache/tika/eval/io/DBWriter.java | 141 .../org/apache/tika/eval/io/ExtractReader.java | 161 .../java/org/apache/tika/eval/io/IDBWriter.java | 31 + .../apache/tika/eval/io/XMLLogMsgHandler.java | 26 + .../org/apache/tika/eval/io/XMLLogReader.java | 120 +++ .../org/apache/tika/eval/reports/Report.java| 197 + .../tika/eval/reports/ResultsReporter.java | 295
+++ .../tika/eval/reports/XLSXHREFFormatter.java| 79 ++ .../tika/eval/reports/XLSXNumFormatter.java | 54 ++ .../tika/eval/reports/XSLXCellFormatter.java| 30 + .../tokens/AlphaIdeographFilterFactory.java | 74 ++ .../tika/eval/tokens/AnalyzerDeserializer.java | 345 .../tika/eval/tokens/AnalyzerManager.java | 95 +++ .../CJKBigramAwareLengthFilterFactory.java | 74 ++ .../eval/tokens/CommonTokenCountManager.java| 141 .../tika/eval/tokens/CommonTokenResult.java | 37 + .../tika/eval/tokens/ContrastStatistics.java| 78 ++ .../tika/eval/tokens/TokenContraster.java | 183 + .../eval/tokens/TokenCountPriorityQueue.java| 49 ++ .../apache/tika/eval/tokens/TokenCounter.java | 167 .../apache/tika/eval/tokens/TokenIntPair.java | 82 ++ .../tika/eval/tokens/TokenStatistics.java | 127 +++ .../tika/eval/util/LanguageIDWrapper.java | 69 ++ ...ache.lucene.analysis.util.TokenFilterFactory | 17 + .../src/main/resources/comparison-reports.xml | 791 +++ .../src/main/resources/lucene-analyzers.json| 107 +++ .../src/main/resources/lucene-char-mapping.txt | 2 + .../src/main/resources/profile-reports.xml | 148 .../resources/tika-eval-comparison-config.xml | 81 ++ .../resources/tika-eval-profiler-config.xml | 76 ++ .../test/java/org/apache/tika/MockDBWriter.java | 73 ++ .../apache/tika/eval/AnalyzerManagerTest.java | 79 ++ .../org/apache/tika/eval/ComparerBatchTest.java | 411 ++ .../org/apache/tika/eval/ProfilerBatchTest.java | 236 ++ .../apache/tika/eval/SimpleComparerTest.java| 289 +++ .../org/apache/tika/eval/TikaEvalCLITest.java | 42 + .../apache/tika/eval/db/AbstractBufferTest.java | 160 .../apache/tika/eval/io/ExtractReaderTest.java | 85 ++ .../tika/eval/io/FatalExceptionReaderTest.java | 32 + .../tika/eval/reports/ResultsReporterTest.java | 60 ++ .../tika/eval/tokens/LuceneTokenCounter.java| 191 + .../tika/eval/tokens/TokenCounterTest.java | 131 +++ .../org/apache/tika/eval/util/MimeUtilTest.java | 65 ++ tika-eval/src/test/resources/commontokens/en| 8 + 
tika-eval/src/test/resources/commontokens/es| 10 + tika-eval/src/test/resources/commontokens/zh-cn | 8 + tika-eval/src/test/resources/commontokens/zh-tw | 8 + tika-eval/src/test/resources/log4j.properties | 11 + .../src/test/resources/log4j_process.properties | 11 + ...ingle-file-profiler-crawl-extract-config.xml | 72 ++
[6/6] tika git commit: TIKA-1332 -- initial commit for tika-eval module. More work remains.
TIKA-1332 -- initial commit for tika-eval module. More work remains. Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/aa7a0c35 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/aa7a0c35 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/aa7a0c35 Branch: refs/heads/master Commit: aa7a0c353362d56cb1b8e77297f0807626b0246c Parents: b9befb4 Author: tballison Authored: Thu Feb 16 12:18:32 2017 -0500 Committer: tballison Committed: Thu Feb 16 12:18:32 2017 -0500 -- CHANGES.txt | 2 + LICENSE.txt | 8 + pom.xml | 1 + tika-eval/pom.xml | 281 +++ .../org/apache/tika/eval/AbstractProfiler.java | 693 .../org/apache/tika/eval/EvalFilePaths.java | 108 +++ .../org/apache/tika/eval/ExtractComparer.java | 455 +++ .../org/apache/tika/eval/ExtractProfiler.java | 238 ++ .../java/org/apache/tika/eval/TikaEvalCLI.java | 262 ++ .../apache/tika/eval/XMLErrorLogUpdater.java| 226 ++ .../tika/eval/batch/DBConsumersManager.java | 92 +++ .../tika/eval/batch/EvalConsumerBuilder.java| 134 .../tika/eval/batch/EvalConsumersBuilder.java | 133 .../tika/eval/batch/FileComparerBuilder.java| 122 +++ .../eval/batch/SingleFileConsumerBuilder.java | 108 +++ .../apache/tika/eval/db/AbstractDBBuffer.java | 77 ++ .../java/org/apache/tika/eval/db/ColInfo.java | 116 +++ .../main/java/org/apache/tika/eval/db/Cols.java | 90 +++ .../java/org/apache/tika/eval/db/DBBuffer.java | 54 ++ .../java/org/apache/tika/eval/db/DBUtil.java| 201 + .../java/org/apache/tika/eval/db/H2Util.java| 71 ++ .../org/apache/tika/eval/db/MimeBuffer.java | 144 .../java/org/apache/tika/eval/db/TableInfo.java | 64 ++ .../java/org/apache/tika/eval/io/DBWriter.java | 141 .../org/apache/tika/eval/io/ExtractReader.java | 161 .../java/org/apache/tika/eval/io/IDBWriter.java | 31 + .../apache/tika/eval/io/XMLLogMsgHandler.java | 26 + .../org/apache/tika/eval/io/XMLLogReader.java | 120 +++ .../org/apache/tika/eval/reports/Report.java| 197 +
.../tika/eval/reports/ResultsReporter.java | 295 +++ .../tika/eval/reports/XLSXHREFFormatter.java| 79 ++ .../tika/eval/reports/XLSXNumFormatter.java | 54 ++ .../tika/eval/reports/XSLXCellFormatter.java| 30 + .../tokens/AlphaIdeographFilterFactory.java | 74 ++ .../tika/eval/tokens/AnalyzerDeserializer.java | 345 .../tika/eval/tokens/AnalyzerManager.java | 95 +++ .../CJKBigramAwareLengthFilterFactory.java | 74 ++ .../eval/tokens/CommonTokenCountManager.java| 141 .../tika/eval/tokens/CommonTokenResult.java | 37 + .../tika/eval/tokens/ContrastStatistics.java| 78 ++ .../tika/eval/tokens/TokenContraster.java | 183 + .../eval/tokens/TokenCountPriorityQueue.java| 49 ++ .../apache/tika/eval/tokens/TokenCounter.java | 167 .../apache/tika/eval/tokens/TokenIntPair.java | 82 ++ .../tika/eval/tokens/TokenStatistics.java | 127 +++ .../tika/eval/util/LanguageIDWrapper.java | 69 ++ ...ache.lucene.analysis.util.TokenFilterFactory | 17 + .../src/main/resources/comparison-reports.xml | 791 +++ .../src/main/resources/lucene-analyzers.json| 107 +++ .../src/main/resources/lucene-char-mapping.txt | 2 + .../src/main/resources/profile-reports.xml | 148 .../resources/tika-eval-comparison-config.xml | 83 ++ .../resources/tika-eval-profiler-config.xml | 76 ++ .../test/java/org/apache/tika/MockDBWriter.java | 73 ++ .../apache/tika/eval/AnalyzerManagerTest.java | 79 ++ .../org/apache/tika/eval/ComparerBatchTest.java | 411 ++ .../org/apache/tika/eval/ProfilerBatchTest.java | 236 ++ .../apache/tika/eval/SimpleComparerTest.java| 289 +++ .../org/apache/tika/eval/TikaEvalCLITest.java | 42 + .../apache/tika/eval/db/AbstractBufferTest.java | 160 .../apache/tika/eval/io/ExtractReaderTest.java | 85 ++ .../tika/eval/io/FatalExceptionReaderTest.java | 32 + .../tika/eval/reports/ResultsReporterTest.java | 60 ++ .../tika/eval/tokens/LuceneTokenCounter.java| 191 + .../tika/eval/tokens/TokenCounterTest.java | 131 +++ .../org/apache/tika/eval/util/MimeUtilTest.java | 65 ++ 
tika-eval/src/test/resources/commontokens/en| 8 + tika-eval/src/test/resources/commontokens/es| 10 + tika-eval/src/test/resources/commontokens/zh-cn | 8 + tika-eval/src/test/resources/commontokens/zh-tw | 8 + tika-eval/src/test/resources/log4j.properties | 11 + .../src/test/resources/log4j_process.properties | 11 + ...ingle-file-profiler-crawl-extract-config.xml | 72 ++
[2/6] tika git commit: TIKA-1332 -- initial commit for tika-eval module. More work remains.
http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java -- diff --git a/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java b/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java new file mode 100644 index 000..0d925cf --- /dev/null +++ b/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java @@ -0,0 +1,411 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.eval; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.nio.file.FileSystems; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.Connection; +import java.sql.ResultSet; +import java.sql.ResultSetMetaData; +import java.sql.Statement; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.io.FileUtils; +import org.apache.tika.batch.fs.FSBatchTestBase; +import org.apache.tika.eval.db.Cols; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.junit.Test; + +@Ignore("need to fix tika-batch tests to make this work") +public class ComparerBatchTest extends FSBatchTestBase { + +public final static String COMPARER_PROCESS_CLASS = "org.apache.tika.batch.fs.FSBatchProcessCLI"; + +private static Path dbDir; +private static Connection conn; + +private final static String compJoinCont = ""; +/*ExtractComparer.COMPARISONS_TABLE+" cmp " + +"join "+ExtractComparer.CONTAINERS_TABLE + " cnt "+ +"on cmp."+AbstractProfiler.CONTAINER_HEADERS.CONTAINER_ID+ +" = cnt."+AbstractProfiler.CONTAINER_HEADERS.CONTAINER_ID;*/ + +@BeforeClass +public static void setUp() throws Exception { + +File inputRoot = new File(ComparerBatchTest.class.getResource("/test-dirs").toURI()); +dbDir = Files.createTempDirectory(inputRoot.toPath(), "tika-test-db-dir-"); +Mapargs = new HashMap<>(); +Path db = FileSystems.getDefault().getPath(dbDir.toString(), "comparisons_test"); +args.put("-db", db.toString()); + +//for debugging, you can use this to select only one file pair to load +//args.put("-includeFilePat", "file8.*"); +/* +BatchProcessTestExecutor ex = new BatchProcessTestExecutor(COMPARER_PROCESS_CLASS, args, +"/tika-batch-comparison-eval-config.xml"); +StreamStrings streamStrings = ex.execute(); 
+System.out.println(streamStrings.getErrString()); +System.out.println(streamStrings.getOutString()); +H2Util dbUtil = new H2Util(db); +conn = dbUtil.getConnection();*/ +} + +@AfterClass +public static void tearDown() throws Exception { + +conn.close(); + +FileUtils.deleteDirectory(dbDir.toFile()); +} + + +@Test +public void testSimpleDBWriteAndRead() throws Exception { +Set set = new HashSet<>(); +//filenames +List list = getColStrings(Cols.FILE_NAME.name(), +ExtractComparer.PROFILES_A.getName(), ""); +assertEquals(7, list.size()); +assertTrue(list.contains("file1.pdf")); + +//container ids in comparisons table +list = getColStrings(Cols.CONTAINER_ID.name(), +ExtractComparer.COMPARISON_CONTAINERS.getName(),""); +assertEquals(10, list.size()); +set.clear(); set.addAll(list); +assertEquals(10, set.size()); +/* +//ids in comparisons table +list = getColStrings(AbstractProfiler.HEADERS.ID.name(), +compTable,""); +assertEquals(9, list.size()); +set.clear(); set.addAll(list); +assertEquals(9, set.size());*/ +} + + + +/* +@Test +public void testFile1PDFRow() throws Exception { +String where = fp+"='file1.pdf'"; +Map data = getRow(compJoinCont, where); +String result =
[3/6] tika git commit: TIKA-1332 -- initial commit for tika-eval module. More work remains.
http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java -- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java new file mode 100644 index 000..28e1c78 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.eval.tokens; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.lang3.mutable.MutableInt; +import org.apache.commons.math3.stat.descriptive.SummaryStatistics; +import org.apache.commons.math3.util.FastMath; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +public class TokenCounter { + +private static final String ALPHA_IDEOGRAPH_SUFFIX = "_a"; + + +Map> map = new HashMap<>(); //Map > +Map tokenStatistics = new HashMap<>(); + +private final TokenStatistics NULL_TOKEN_STAT = new TokenStatistics( +0, 0, new TokenIntPair[0], 0.0d, new SummaryStatistics()); + +private final Analyzer generalAnalyzer; +private final Analyzer alphaIdeoAnalyzer; + +private int topN = 10; + +public TokenCounter(Analyzer generalAnalyzer, Analyzer alphaIdeoAnalyzer) throws IOException { +this.generalAnalyzer = generalAnalyzer; +this.alphaIdeoAnalyzer = alphaIdeoAnalyzer; +} + +public void add(String field, String content) throws IOException { +_add(field, generalAnalyzer, content); +_add(field+ALPHA_IDEOGRAPH_SUFFIX, alphaIdeoAnalyzer, content); +} + +private void _add(String field, Analyzer analyzer, String content) throws IOException { +int totalTokens = 0; + +TokenStream ts = analyzer.tokenStream(field, content); +CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); +ts.reset(); +Map tokenMap = map.get(field); +if (tokenMap == null) { +tokenMap = new HashMap<>(); +map.put(field, tokenMap); +} +while (ts.incrementToken()) { +String token = termAtt.toString(); +MutableInt cnt = tokenMap.get(token); +if (cnt == null) { +cnt = new MutableInt(1); +tokenMap.put(token, cnt); +} else { +cnt.increment(); +} +totalTokens++; +} +ts.close(); +ts.end(); + +int totalUniqueTokens = tokenMap.size(); + +double ent = 0.0d; +double p = 0.0d; +double base = 
2.0; + +TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN); + +SummaryStatistics summaryStatistics = new SummaryStatistics(); +for (Map.Entry e : tokenMap.entrySet()) { +String token = e.getKey(); +int termFreq = e.getValue().intValue(); + +p = (double) termFreq / (double) totalTokens; +ent += p * FastMath.log(base, p); +int len = token.codePointCount(0, token.length()); +for (int i = 0; i < e.getValue().intValue(); i++) { +summaryStatistics.addValue(len); +} +if (queue.top() == null || queue.size() < topN || +termFreq >= queue.top().getValue()) { +queue.insertWithOverflow(new TokenIntPair(token, termFreq)); +} + +} +if (totalTokens > 0) { +ent = (-1.0d / (double)totalTokens) * ent; +} + +/*Collections.sort(allTokens); +List topNList = new ArrayList<>(topN); +for (int i = 0; i < topN && i < allTokens.size(); i++) { +topNList.add(allTokens.get(i)); +}*/ + +tokenStatistics.put(field,
[4/6] tika git commit: TIKA-1332 -- initial commit for tika-eval module. More work remains.
http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java -- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java new file mode 100644 index 000..cd90f76 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java @@ -0,0 +1,161 @@ +package org.apache.tika.eval.io; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; +import org.apache.commons.compress.compressors.z.ZCompressorInputStream; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.IOUtils; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.serialization.JsonMetadataList; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.RecursiveParserWrapper; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class ExtractReader { + +public enum ALTER_METADATA_LIST { +AS_IS, //leave the metadata list as is +FIRST_ONLY, //take only the metadata list for the "container" document +CONCATENATE_CONTENT_INTO_FIRST // concatenate all of the content into the first +} +private final static Logger LOGGER = LoggerFactory.getLogger(ExtractReader.class); +TikaConfig tikaConfig = TikaConfig.getDefaultConfig(); + +public List loadExtract(Path thisFile, ALTER_METADATA_LIST alterExtractList) { +List metadataList = null; +if (thisFile == null || !Files.isRegularFile(thisFile)) { +return metadataList; +} +Reader reader = null; +InputStream is = null; +FileSuffixes fileSuffixes = parseSuffixes(thisFile.getFileName().toString()); +if (fileSuffixes.txtOrJson == null) { +LOGGER.warn("file must end with .txt or .json: "+thisFile.getFileName().toString()); +return metadataList; +} + +try { +is = Files.newInputStream(thisFile); +if (fileSuffixes.compression != null) { +if (fileSuffixes.compression.equals("bz2")) { +is = new BZip2CompressorInputStream(is); +} else if (fileSuffixes.compression.equals("gz")) { +is = new GzipCompressorInputStream(is); +} else if (fileSuffixes.compression.equals("zip")) { +is = new ZCompressorInputStream(is); +} else { +LOGGER.warn("Can't yet process compression of type: "+fileSuffixes.compression); +} +} +reader = new BufferedReader(new InputStreamReader(is, "UTF-8")); + +if (fileSuffixes.txtOrJson.equals("json")) { +metadataList = JsonMetadataList.fromJson(reader); +if (alterExtractList.equals(ALTER_METADATA_LIST.FIRST_ONLY) && 
metadataList.size() > 1) { +while (metadataList.size() > 1) { +metadataList.remove(metadataList.size()-1); +} +} else if (alterExtractList.equals(ALTER_METADATA_LIST.AS_IS.CONCATENATE_CONTENT_INTO_FIRST) && +metadataList.size() > 1) { +StringBuilder sb = new StringBuilder(); +Metadata containerMetadata = metadataList.get(0); +for (int i = 0; i < metadataList.size(); i++) { +Metadata m = metadataList.get(i); +String c = m.get(RecursiveParserWrapper.TIKA_CONTENT); +if (c
[1/6] tika git commit: TIKA-1332 -- initial commit for tika-eval module. More work remains.
Repository: tika Updated Branches: refs/heads/master b9befb427 -> aa7a0c353 http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json -- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json b/tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json new file mode 100644 index 000..6ef09de --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json @@ -0,0 +1,5 @@ +[{ + "Content-Type":"text/plain", + "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog 1,200 12", + "xmpTPg:NPages":2 +}] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file10_permahang.txt.json -- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file10_permahang.txt.json b/tika-eval/src/test/resources/test-dirs/extractsA/file10_permahang.txt.json new file mode 100644 index 000..e69de29 http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file11_oom.txt.json -- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file11_oom.txt.json b/tika-eval/src/test/resources/test-dirs/extractsA/file11_oom.txt.json new file mode 100644 index 000..e69de29 http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json -- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json b/tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json new file mode 100644 index 000..0e2558b --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json @@ -0,0 +1,4 @@ +[{ + "Content-Type":"text/plain", + "X-TIKA:content":"El zorro marrón rápido saltó sobre el perro. El zorro marrón rápido saltó sobre el perro. 
El zorro marrón rápido saltó sobre el perro" +}] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json -- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json b/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json new file mode 100644 index 000..5371c87 --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json @@ -0,0 +1,10 @@ +[{ + "Content-Type":"text/plain", + "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog" + }, + { +"Content-Type":"text/plain", +"X-TIKA:embedded_resource_path":"inner.txt", +"X-TIKA:content":"attachment contents" + } +] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json -- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json b/tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json new file mode 100644 index 000..5371c87 --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json @@ -0,0 +1,10 @@ +[{ + "Content-Type":"text/plain", + "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog" + }, + { +"Content-Type":"text/plain", +"X-TIKA:embedded_resource_path":"inner.txt", +"X-TIKA:content":"attachment contents" + } +] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json -- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json b/tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json new file mode 100644 index 000..18763d1 --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json @@ -0,0 +1,4 @@ 
+[{ + "Content-Type":"text/plain", + "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog" +}] \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file4_emptyB.pdf.json -- diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file4_emptyB.pdf.json b/tika-eval/src/test/resources/test-dirs/extractsA/file4_emptyB.pdf.json new file mode
[Tika Wiki] Update of "MockParser" by TimothyAllison
Dear Wiki user, You have subscribed to a wiki page or wiki category on "Tika Wiki" for change notification. The "MockParser" page has been changed by TimothyAllison: https://wiki.apache.org/tika/MockParser?action=diff=3=4 Please note that for 3., permanent hangs -- you cannot terminate the Thread. Thread's ''stop'', ''suspend'', ''destroy'' sound like they'll do the trick, but they won't. '''You need to kill the entire process.''' - As of Tika 1.15, we added a MockParser in the tika-core-tests.jar that will allow you to test your framework against 1-3. Simply add that jar to your class path and then include a xml file in your set of test documents, and crash, crash away. + As of Tika 1.15, we added a MockParser in the tika-core-tests.jar that will allow you to test your framework against items 1-3. Simply add that jar to your class path and then include a xml file in your set of test documents, and crash, crash away. == Usage == @@ -36, +36 @@ === Your Framework === Place the tika-core-tests.jar on your class path (NOT IN PRODUCTION!!!) and then add some mock.xml files to your batch of documents. - === Mock options ===
[Tika Wiki] Update of "MockParser" by TimothyAllison
Dear Wiki user, You have subscribed to a wiki page or wiki category on "Tika Wiki" for change notification. The "MockParser" page has been changed by TimothyAllison: https://wiki.apache.org/tika/MockParser?action=diff=1=2 == Background == So, you've tried Tika on a couple of files and all works well. Problem solved! + No. + - No. In very rare cases, Tika can so some really bad things. We try to fix these problems when we can, but if history is any indication (e.g. [[https://issues.apache.org/jira/browse/TIKA-1132|TIKA-1132]]), if you are processing millions of files, you'll need to defend against: + In very rare cases, Tika can so some really bad things. We try to fix these problems when we can, but if history is any indication (e.g. [[https://issues.apache.org/jira/browse/TIKA-1132|TIKA-1132]]), if you are processing millions/billions of files from the wild, you'll need to defend against: 1. Regular catchable exceptions 2. !OutOfMemory errors which can put the jvm in an unreliable state @@ -24, +26 @@ `java -cp "bin/*" org.apache.tika.TikaCLI mock_example.xml` === Tika-server === - Place the tika-server.jar and the tika-core.tests.jar in a "bin directory. + Place the tika-server.jar and the tika-core.tests.jar in a "bin" directory. - `java -cp "serverbin/*" org.apache.tika.server.TikaServerCli` + `java -cp "bin/*" org.apache.tika.server.TikaServerCli` + + Then curl away: + + `curl -T mock_example.xml http://localhost:9998/rmeta/text` === Your Framework === Place the tika-core-tests.jar on your class path (NOT IN PRODUCTION!!!) and then add some mock.xml files to your batch of documents. - - Then curl away: - - `curl -T mock_example.xml http://localhost:9998/rmeta/text` === Mock options === See the mock example.xml file in tika-parsers/src/test/resources/test-documents/mock. @@ -84, +86 @@ `` + == References == + 1. [[http://openpreservation.org/blog/2014/03/21/tika-ride-characterising-web-content-nanite/|Tika to Ride]] + 2. 
[[http://events.linuxfoundation.org/sites/events/files/slides/TikaEval_ACNA15_allison_herceg_v2.pdf|Evaluating Text Extraction]]
[Tika Wiki] Update of "MockParser" by TimothyAllison
Dear Wiki user, You have subscribed to a wiki page or wiki category on "Tika Wiki" for change notification. The "MockParser" page has been changed by TimothyAllison: https://wiki.apache.org/tika/MockParser New page: = MockParser = == Background == So, you've tried Tika on a couple of files and all works well. Problem solved! No. In very rare cases, Tika can do some really bad things. We try to fix these problems when we can, but if history is any indication (e.g. [[https://issues.apache.org/jira/browse/TIKA-1132|TIKA-1132]]), if you are processing millions of files, you'll need to defend against: 1. Regular catchable exceptions 2. !OutOfMemory errors which can put the jvm in an unreliable state 3. Permanent hangs (Tika can chew up massive amounts of resources and go ''forever'') 4. Security vulnerabilities (e.g. [[http://seclists.org/bugtraq/2016/Nov/40|CVE-2016-6809]] and [[http://seclists.org/oss-sec/2016/q2/413|CVE-2016-4434]]) Please note that for 3., permanent hangs -- you cannot terminate the Thread. Thread's ''stop'', ''suspend'', ''destroy'' sound like they'll do the trick, but they won't. '''You need to kill the entire process.''' As of Tika 1.15, we added a MockParser in the tika-core-tests.jar that will allow you to test your framework against items 1-3. Simply add that jar to your class path and then include an XML file in your set of test documents, and crash, crash away. == Usage == === Tika-app === Place the tika-app.jar and the tika-core-tests.jar in a "bin" directory. `java -cp "bin/*" org.apache.tika.TikaCLI mock_example.xml` === Tika-server === Place the tika-server.jar and the tika-core-tests.jar in a "bin" directory. `java -cp "bin/*" org.apache.tika.server.TikaServerCli` === Your Framework === Place the tika-core-tests.jar on your class path (NOT IN PRODUCTION!!!) and then add some mock.xml files to your batch of documents. 
Then curl away: `curl -T mock_example.xml http://localhost:9998/rmeta/text` === Mock options === See the mock example.xml file in tika-parsers/src/test/resources/test-documents/mock. This shows all of the examples of what you can do. ``` Nikolai Lobachevsky some content writing to System.out writing to System.err not another IOException ``
[Tika Wiki] Update of "FrontPage" by TimothyAllison
Dear Wiki user, You have subscribed to a wiki page or wiki category on "Tika Wiki" for change notification. The "FrontPage" page has been changed by TimothyAllison: https://wiki.apache.org/tika/FrontPage?action=diff=56=57 * [[Troubleshooting Tika]] * [[TikaParserNotes|Notes on Specific Parsers]] * [[TikaEval|Using the tika-eval Module]] + * [[MockParser|How to Test Your Framework's Handling of Tika Behaving Badly]] = MIME identification design/implementation = * [[BaysianMimeTypeSelector|Bayesian MIME selection]] - Tika's new Bayesian MIME selector.