[1/4] tika git commit: TIKA-1332 -- add English/Spanish common tokens, fix logging

2017-02-16 Thread tallison
Repository: tika
Updated Branches:
  refs/heads/master a2d214c71 -> dc2dcd4cc


http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/resources/log4j.properties
--
diff --git a/tika-eval/src/main/resources/log4j.properties 
b/tika-eval/src/main/resources/log4j.properties
new file mode 100644
index 000..925f9f2
--- /dev/null
+++ b/tika-eval/src/main/resources/log4j.properties
@@ -0,0 +1,11 @@
+
+log4j.rootLogger=WARN,A1
+
+#for debugging
+#log4j.rootLogger=TRACE,A1
+
+log4j.appender.A1=org.apache.log4j.ConsoleAppender
+
+# A1 uses PatternLayout.
+log4j.appender.A1.layout=org.apache.log4j.PatternLayout
+log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n

http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/resources/tika-eval-comparison-config.xml
--
diff --git a/tika-eval/src/main/resources/tika-eval-comparison-config.xml 
b/tika-eval/src/main/resources/tika-eval-comparison-config.xml
index 04ef658..88fdd0a 100644
--- a/tika-eval/src/main/resources/tika-eval-comparison-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-comparison-config.xml
@@ -28,8 +28,6 @@
 >
 
 
-
 
 
-
 
 
 
@@ -72,7 +68,7 @@
crawlingInputDir="false"
minJsonFileSizeBytes="-1"
maxJsonFileSizeBytes="200"
-   commonTokens="resources/commontokens"
+   commonTokens="resources/common_tokens"
 />
 
 

http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/resources/tika-eval-profiler-config.xml
--
diff --git a/tika-eval/src/main/resources/tika-eval-profiler-config.xml 
b/tika-eval/src/main/resources/tika-eval-profiler-config.xml
index bd94b25..be7adf4 100644
--- a/tika-eval/src/main/resources/tika-eval-profiler-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-profiler-config.xml
@@ -27,16 +27,13 @@
 timeoutThresholdMillis="30">
 
 
-
-
 
 
 
-
 
@@ -66,7 +63,7 @@
 
 
+   commonTokens="resources/common_tokens"/>
 
 
 

http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
--
diff --git 
a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java 
b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
index 72e8008..6d4d4ef 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
@@ -59,7 +59,7 @@ public class SimpleComparerTest extends TikaTest {
 Paths.get("extractsA"), Paths.get("extractsB"),
 writer, -1, -1,
 ExtractReader.ALTER_METADATA_LIST.AS_IS);
-AbstractProfiler.loadCommonTokens(this.getResourceAsFile("/commontokens").toPath());
+AbstractProfiler.loadCommonTokens(this.getResourceAsFile("/common_tokens").toPath());
 LanguageIDWrapper.loadBuiltInModels();
 }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
--
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java 
b/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
index c358149..ff0961c 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
@@ -30,7 +30,7 @@ public class TikaEvalCLITest {
 public void testBasic() throws Exception {
 List args = new ArrayList<>();
 args.add("Profile");
-args.add("-extractDir");
+args.add("-extracts");
 args.add("tika");
 args.add("-db");
 args.add("mydb");

http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/test/resources/common_tokens/en
--
diff --git a/tika-eval/src/test/resources/common_tokens/en 
b/tika-eval/src/test/resources/common_tokens/en
new file mode 100644
index 000..8d442fe
--- /dev/null
+++ b/tika-eval/src/test/resources/common_tokens/en
@@ -0,0 +1,8 @@
+the
+of
+and
+a
+or
+#quick
+brown
+fox
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/test/resources/common_tokens/es
--
diff --git a/tika-eval/src/test/resources/common_tokens/es 
b/tika-eval/src/test/resources/common_tokens/es
new file mode 100644
index 000..b9bfd03
--- /dev/null
+++ 

[2/4] tika git commit: TIKA-1332 -- add English/Spanish common tokens, fix logging

2017-02-16 Thread tallison
http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/resources/common_tokens/es
--
diff --git a/tika-eval/src/main/resources/common_tokens/es 
b/tika-eval/src/main/resources/common_tokens/es
new file mode 100644
index 000..2889e7c
--- /dev/null
+++ b/tika-eval/src/main/resources/common_tokens/es
@@ -0,0 +1,19997 @@
+como
+para
+tambien
+esta
+entre
+este
+desde
+anos
+hasta
+parte
+donde
+sobre
+___url___
+durante
+pero
+historia
+tiene
+vease
+primera
+nombre
+ciudad
+despues
+cuando
+otros
+gran
+encuentra
+cual
+familia
+primer
+estado
+mismo
+estados
+solo
+siglo
+tres
+otras
+unidos
+segun
+ademas
+habia
+forma
+puede
+fueron
+nacional
+bajo
+espana
+aunque
+poblacion
+lugar
+junto
+sido
+mayor
+tras
+todo
+oficial
+ingles
+tiempo
+siendo
+nueva
+todos
+estaba
+general
+antes
+vida
+tenia
+nuevo
+hacia
+grupo
+tanto
+segunda
+quien
+cada
+algunos
+mientras
+personas
+embargo
+varios
+norte
+espanol
+julio
+habitantes
+debido
+total
+estos
+provincia
+luego
+estadounidense
+tuvo
+bibliografia
+eran
+mundo
+contra
+otro
+centro
+region
+cuatro
+mayo
+principal
+entonces
+pais
+conocido
+universidad
+trabajo
+cuenta
+unos
+septiembre
+enero
+octubre
+guerra
+marzo
+serie
+segundo
+junio
+diciembre
+final
+otra
+dentro
+agosto
+especie
+abril
+paso
+noviembre
+ellos
+internacional
+tierra
+actualmente
+numero
+varias
+esto
+algunas
+hace
+juan
+poco
+hizo
+carrera
+estas
+traves
+estan
+febrero
+jose
+biografia
+muchos
+partir
+medio
+demografia
+estudio
+grandes
+mejor
+agua
+misma
+sitio
+largo
+caso
+equipo
+tarde
+siguiente
+obra
+tipo
+gobierno
+publico
+todas
+bien
+ultimo
+llamado
+posteriormente
+primeros
+geografia
+mundial
+pueden
+casa
+cualquier
+sistema
+maria
+zona
+importante
+momento
+desarrollo
+real
+frente
+epoca
+hijo
+menos
+comenzo
+hecho
+actual
+ella
+toda
+obras
+muerte
+diferentes
+termino
+departamento
+orden
+central
+partido
+titulo
+municipio
+veces
+cerca
+argentina
+casi
+base
+principales
+origen
+lista
+cuales
+dias
+tienen
+francia
+periodo
+edad
+siguientes
+presidente
+padre
+mayoria
+manera
+finalmente
+pagina
+distrito
+estudios
+cinco
+localidad
+hacer
+punto
+alrededor
+superficie
+haber
+capital
+club
+carlos
+nivel
+censo
+santa
+pueblo
+mexico
+poder
+espanola
+condado
+finales
+ante
+ubicado
+nacio
+distribucion
+cambio
+america
+republica
+ambos
+fuera
+conocida
+tener
+genero
+television
+director
+llego
+puesto
+mediante
+ejemplo
+musica
+caracteristicas
+ultima
+luis
+pesar
+europa
+siempre
+estuvo
+lado
+llamada
+escuela
+miembro
+cargo
+paises
+importantes
+inicio
+area
+linea
+primero
+muchas
+alto
+reino
+futbol
+mucho
+edicion
+incluso
+politica
+publicado
+propio
+produccion
+oficina
+francisco
+oeste
+popular
+convirtio
+notas
+division
+media
+antonio
+encuentran
+version
+temporada
+cabo
+premio
+miembros
+madrid
+iglesia
+original
+union
+frances
+francesa
+exito
+superior
+construccion
+unico
+principalmente
+antiguo
+recibio
+porque
+pelicula
+libro
+alli
+informacion
+direccion
+igual
+densidad
+grande
+sino
+debe
+campo
+hombres
+banda
+compuesto
+acuerdo
+seria
+politico
+anterior
+relacion
+proyecto
+decada
+ello
+seis
+alta
+menor
+estilo
+metros
+principios
+cuerpo
+resto
+posicion
+meses
+realizo
+incluyendo
+datos
+ahora
+cultura
+mujeres
+categoria
+situada
+servicio
+york
+programa
+sede
+militar
+especial
+comunidad
+perteneciente
+especialmente
+unas
+antigua
+profesional
+sociedad
+interior
+ellas
+habian
+nacido
+coordenadas
+alemania
+instituto
+obtuvo
+pedro
+natural
+descripcion
+local
+trata
+proceso
+social
+conjunto
+organizacion
+costa
+pequeno
+incluye
+gracias
+vista
+album
+actualidad
+compania
+aparece
+estaban
+participo
+propia
+estar
+tercera
+honor
+formacion
+ultimos
+comun
+color
+joven
+llegar
+personal
+italia
+nombrado
+revista
+nunca
+diversos
+papel
+madre
+arte
+pacifico
+hombre
+john
+dicho
+unica
+pues
+tercer
+grupos
+decir
+manuel
+especies
+creacion
+isla
+situado
+referencia
+cuyo
+primeras
+fecha
+posible
+modo
+interes
+hijos
+lugares
+derecho
+unido
+canciones
+pequena
+buenos
+cine
+situacion
+mujer
+poblacional
+cancion
+muestra
+gano
+medios
+territorio
+ubicada
+resultado
+movimiento
+millones
+problemas
+llevo
+actividad
+paris
+blanco
+martin
+miguel
+hija
+publica
+servicios
+chile
+nuevos
+altura
+civil
+existen
+algo
+autor
+principio
+blancos
+empresa
+plantas
+hermano
+ejercito
+diversas
+ocasiones
+aproximadamente
+libre
+partidos
+dado
+musical
+campeonato
+regreso
+trayectoria
+puntos
+partes
+apoyo
+baja
+cantidad
+lleva
+tenian
+dicha
+aleman
+nacionales
+causa
+victoria
+existe
+nuevas
+posee
+isbn
+teatro
+control
+geografica
+personajes
+premios
+ciudades
+cuya
+asociacion
+murio
+juegos
+similar
+clubes
+santiago
+participacion
+presencia
+seleccion
+presenta
+decidio
+museo
+volvio
+siete
+liga
+fundacion
+logro
+actividades
+equipos
+raza
+espacio
+fuerza
+objetivo
+cultural
+fuerte

[3/4] tika git commit: TIKA-1332 -- add English/Spanish common tokens, fix logging

2017-02-16 Thread tallison
http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/resources/common_tokens/en
--
diff --git a/tika-eval/src/main/resources/common_tokens/en 
b/tika-eval/src/main/resources/common_tokens/en
new file mode 100644
index 000..7426945
--- /dev/null
+++ b/tika-eval/src/main/resources/common_tokens/en
@@ -0,0 +1,2 @@
+with
+from
+which
+that
+also
+this
+were
+first
+other
+after
+been
+have
+when
+their
+more
+there
+into
+time
+over
+they
+during
+years
+most
+known
+only
+some
+made
+including
+___url___
+between
+under
+where
+about
+part
+later
+many
+three
+history
+such
+used
+then
+than
+united
+well
+while
+both
+being
+early
+states
+through
+year
+american
+became
+them
+these
+name
+called
+however
+before
+since
+would
+several
+until
+world
+second
+people
+following
+same
+high
+city
+area
+born
+four
+number
+life
+national
+family
+based
+north
+state
+named
+south
+those
+although
+because
+another
+work
+long
+like
+around
+each
+according
+former
+place
+along
+major
+line
+john
+still
+general
+large
+group
+small
+began
+school
+found
+will
+within
+located
+much
+west
+include
+often
+back
+very
+five
+last
+could
+present
+home
+against
+march
+main
+together
+june
+public
+series
+january
+october
+september
+great
+even
+july
+among
+every
+took
+included
+left
+late
+april
+best
+what
+just
+east
+system
+century
+down
+order
+times
+original
+august
+near
+december
+white
+member
+november
+become
+local
+house
+university
+total
+held
+third
+children
+given
+different
+government
+make
+various
+having
+death
+land
+international
+without
+british
+population
+received
+company
+though
+died
+using
+members
+english
+again
+married
+february
+county
+town
+built
+single
+considered
+point
+created
+came
+service
+served
+popular
+next
+take
+established
+period
+similar
+once
+others
+originally
+short
+central
+said
+york
+full
+career
+side
+making
+further
+published
+living
+released
+moved
+common
+development
+water
+never
+modern
+important
+show
+power
+below
+book
+went
+result
+country
+support
+little
+music
+less
+example
+role
+continued
+played
+produced
+written
+western
+addition
+upon
+film
+days
+final
+developed
+river
+size
+term
+throughout
+president
+post
+formed
+right
+black
+special
+started
+half
+current
+either
+community
+founded
+young
+eventually
+instead
+usually
+good
+center
+office
+seen
+thus
+taken
+control
+lost
+sometimes
+works
+rather
+open
+william
+free
+father
+returned
+french
+average
+almost
+college
+does
+political
+level
+education
+district
+notable
+james
+live
+older
+followed
+george
+england
+version
+wife
+television
+across
+king
+party
+position
+despite
+female
+northern
+production
+working
+described
+himself
+building
+southern
+case
+seven
+america
+play
+available
+includes
+close
+million
+team
+wrote
+today
+list
+largest
+areas
+should
+record
+whose
+above
+region
+park
+worked
+generally
+appeared
+especially
+itself
+middle
+away
+london
+native
+german
+leading
+remained
+months
+joined
+square
+least
+personal
+events
+military
+gave
+outside
+road
+return
+beginning
+david
+currently
+refer
+site
+units
+robert
+business
+lead
+alone
+parts
+possible
+provided
+station
+field
+soon
+official
+class
+church
+opened
+europe
+force
+features
+union
+able
+change
+army
+must
+related
+european
+eastern
+royal
+replaced
+story
+help
+race
+individuals
+association
+brought
+families
+means
+street
+come
+independent
+range
+summer
+involved
+society
+designed
+eight
+character
+referred
+changed
+announced
+light
+award
+significant
+ever
+kingdom
+council
+famous
+earlier
+night
+charles
+island
+private
+previously
+future
+introduced
+process
+services
+type
+added
+geography
+recorded
+director
+project
+provide
+successful
+france
+uses
+lower
+spread
+sold
+information
+husband
+court
+previous
+social
+language
+canada
+program
+limited
+village
+human
+african
+fact
+particularly
+hall
+game
+taking
+interest
+design
+culture
+action
+chief
+research
+completed
+census
+real
+prior
+numerous
+band
+paul
+civil
+lived
+media
+song
+allowed
+recent
+additional
+department
+season
+construction
+whom
+success
+higher
+radio
+longer
+complete
+records
+featured
+initially
+primary
+housing
+places
+certain
+club
+terms
+battle
+groups
+thomas
+behind
+hand
+rock
+better
+board
+already
+traditional
+finally
+particular
+decided
+release
+placed
+associated
+required
+entire
+always
+notes
+standard
+give
+forces
+science
+someone
+mother
+front
+sent
+mostly
+past
+fire
+here
+approximately
+rest
+elected
+active
+star
+performance
+remains
+natural
+income
+wide
+shows
+space
+location
+professional
+start
+thought
+fourth
+michael
+density
+study
+larger
+california
+performed
+germany
+richard
+cross
+playing
+caused
+daughter
+turn
+leader
+division
+word
+brother
+nearly
+love
+commercial
+appointed
+player

[4/4] tika git commit: TIKA-1332 -- add English/Spanish common tokens, fix logging

2017-02-16 Thread tallison
TIKA-1332 -- add English/Spanish common tokens, fix logging


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/dc2dcd4c
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/dc2dcd4c
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/dc2dcd4c

Branch: refs/heads/master
Commit: dc2dcd4ccc7bca640bb362f72729d0b6ba22a890
Parents: a2d214c
Author: tballison 
Authored: Thu Feb 16 20:13:07 2017 -0500
Committer: tballison 
Committed: Thu Feb 16 20:13:07 2017 -0500

--
 .../org/apache/tika/eval/AbstractProfiler.java  |17 +-
 .../org/apache/tika/eval/ExtractProfiler.java   |24 +-
 .../java/org/apache/tika/eval/TikaEvalCLI.java  |22 +-
 .../tika/eval/batch/EvalConsumersBuilder.java   | 2 +-
 .../eval/batch/SingleFileConsumerBuilder.java   |18 +-
 .../eval/tokens/CommonTokenCountManager.java|75 +-
 tika-eval/src/main/resources/common_tokens/en   | 2 +
 tika-eval/src/main/resources/common_tokens/es   | 19997 
 tika-eval/src/main/resources/log4j.properties   |11 +
 .../resources/tika-eval-comparison-config.xml   | 6 +-
 .../resources/tika-eval-profiler-config.xml | 7 +-
 .../apache/tika/eval/SimpleComparerTest.java| 2 +-
 .../org/apache/tika/eval/TikaEvalCLITest.java   | 2 +-
 tika-eval/src/test/resources/common_tokens/en   | 8 +
 tika-eval/src/test/resources/common_tokens/es   |10 +
 .../src/test/resources/common_tokens/zh-cn  | 8 +
 .../src/test/resources/common_tokens/zh-tw  | 8 +
 tika-eval/src/test/resources/commontokens/en| 8 -
 tika-eval/src/test/resources/commontokens/es|10 -
 tika-eval/src/test/resources/commontokens/zh-cn | 8 -
 tika-eval/src/test/resources/commontokens/zh-tw | 8 -
 tika-eval/src/test/resources/log4j.properties   |11 -
 .../src/test/resources/log4j_process.properties |11 -
 ...ingle-file-profiler-crawl-extract-config.xml | 4 +-
 .../single-file-profiler-crawl-input-config.xml | 4 +-
 25 files changed, 40143 insertions(+), 138 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/tika/blob/dc2dcd4c/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
--
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index 24f7358..daa964a 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -158,6 +158,11 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
 final LanguageIDWrapper langIder;
 protected IDBWriter writer;
 
+/**
+ *
+ * @param p path to the common_tokens directory.  If this is null, try to load from classPath
+ * @throws IOException
+ */
 public static void loadCommonTokens(Path p) throws IOException {
 commonTokenCountManager = new CommonTokenCountManager(p);
 }
@@ -536,29 +541,29 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
 /**
  *
  * @param metadata
- * @param extractDir
+ * @param extracts
  * @return evalfilepaths for files if crawling an extract directory
  */
 protected EvalFilePaths getPathsFromExtractCrawl(Metadata metadata,
- Path extractDir) {
+ Path extracts) {
 String relExtractFilePath = metadata.get(FSProperties.FS_REL_PATH);
 Matcher m = FILE_NAME_CLEANER.matcher(relExtractFilePath);
 Path relativeSourceFilePath = Paths.get(m.replaceAll(""));
 //just try slapping the relextractfilepath on the extractdir
-Path extractFile = extractDir.resolve(relExtractFilePath);
+Path extractFile = extracts.resolve(relExtractFilePath);
 if (! Files.isRegularFile(extractFile)) {
 //if that doesn't work, try to find the right extract file.
 //This is necessary if crawling extractsA and trying to find a file in
 //extractsB that is not in the same format: json vs txt or compressed
-extractFile = findFile(extractDir, relativeSourceFilePath);
+extractFile = findFile(extracts, relativeSourceFilePath);
 }
 return new EvalFilePaths(relativeSourceFilePath, extractFile);
 }
 //call this if the crawler is crawling through the src directory
 protected EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path srcDir,
- Path extractDir) {
+ Path extracts) {

[1/4] tika git commit: TIKA-1332 -- add English Spanish common tokens; fix logging

2017-02-16 Thread tallison
Repository: tika
Updated Branches:
  refs/heads/2.x 61532258f -> 81150859b


http://git-wip-us.apache.org/repos/asf/tika/blob/81150859/tika-eval/src/main/resources/log4j.properties
--
diff --git a/tika-eval/src/main/resources/log4j.properties 
b/tika-eval/src/main/resources/log4j.properties
new file mode 100644
index 000..925f9f2
--- /dev/null
+++ b/tika-eval/src/main/resources/log4j.properties
@@ -0,0 +1,11 @@
+
+log4j.rootLogger=WARN,A1
+
+#for debugging
+#log4j.rootLogger=TRACE,A1
+
+log4j.appender.A1=org.apache.log4j.ConsoleAppender
+
+# A1 uses PatternLayout.
+log4j.appender.A1.layout=org.apache.log4j.PatternLayout
+log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n

http://git-wip-us.apache.org/repos/asf/tika/blob/81150859/tika-eval/src/main/resources/tika-eval-comparison-config.xml
--
diff --git a/tika-eval/src/main/resources/tika-eval-comparison-config.xml 
b/tika-eval/src/main/resources/tika-eval-comparison-config.xml
index 2c51616..8070672 100644
--- a/tika-eval/src/main/resources/tika-eval-comparison-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-comparison-config.xml
@@ -28,8 +28,6 @@
 >
 
 
-
 
 
-
 
 
 
@@ -72,7 +68,7 @@
crawlingInputDir="false"
minJsonFileSizeBytes="-1"
maxJsonFileSizeBytes="200"
-   commonTokens="resources/commontokens"
+   commonTokens="resources/common_tokens"
 />
 
 

http://git-wip-us.apache.org/repos/asf/tika/blob/81150859/tika-eval/src/main/resources/tika-eval-profiler-config.xml
--
diff --git a/tika-eval/src/main/resources/tika-eval-profiler-config.xml 
b/tika-eval/src/main/resources/tika-eval-profiler-config.xml
index bd94b25..be7adf4 100644
--- a/tika-eval/src/main/resources/tika-eval-profiler-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-profiler-config.xml
@@ -27,16 +27,13 @@
 timeoutThresholdMillis="30">
 
 
-
-
 
 
 
-
 
@@ -66,7 +63,7 @@
 
 
+   commonTokens="resources/common_tokens"/>
 
 
 

http://git-wip-us.apache.org/repos/asf/tika/blob/81150859/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
--
diff --git 
a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java 
b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
index 72e8008..6d4d4ef 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
@@ -59,7 +59,7 @@ public class SimpleComparerTest extends TikaTest {
 Paths.get("extractsA"), Paths.get("extractsB"),
 writer, -1, -1,
 ExtractReader.ALTER_METADATA_LIST.AS_IS);
-AbstractProfiler.loadCommonTokens(this.getResourceAsFile("/commontokens").toPath());
+AbstractProfiler.loadCommonTokens(this.getResourceAsFile("/common_tokens").toPath());
 LanguageIDWrapper.loadBuiltInModels();
 }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/81150859/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
--
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java 
b/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
index c358149..ff0961c 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
@@ -30,7 +30,7 @@ public class TikaEvalCLITest {
 public void testBasic() throws Exception {
 List args = new ArrayList<>();
 args.add("Profile");
-args.add("-extractDir");
+args.add("-extracts");
 args.add("tika");
 args.add("-db");
 args.add("mydb");

http://git-wip-us.apache.org/repos/asf/tika/blob/81150859/tika-eval/src/test/resources/common_tokens/en
--
diff --git a/tika-eval/src/test/resources/common_tokens/en 
b/tika-eval/src/test/resources/common_tokens/en
new file mode 100644
index 000..8d442fe
--- /dev/null
+++ b/tika-eval/src/test/resources/common_tokens/en
@@ -0,0 +1,8 @@
+the
+of
+and
+a
+or
+#quick
+brown
+fox
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/81150859/tika-eval/src/test/resources/common_tokens/es
--
diff --git a/tika-eval/src/test/resources/common_tokens/es 
b/tika-eval/src/test/resources/common_tokens/es
new file mode 100644
index 000..b9bfd03
--- /dev/null
+++ 

[3/4] tika git commit: TIKA-1332 -- add English Spanish common tokens; fix logging

2017-02-16 Thread tallison
http://git-wip-us.apache.org/repos/asf/tika/blob/81150859/tika-eval/src/main/resources/common_tokens/en
--
diff --git a/tika-eval/src/main/resources/common_tokens/en 
b/tika-eval/src/main/resources/common_tokens/en
new file mode 100644
index 000..7426945
--- /dev/null
+++ b/tika-eval/src/main/resources/common_tokens/en
@@ -0,0 +1,2 @@
+with
+from
+which
+that
+also
+this
+were
+first
+other
+after
+been
+have
+when
+their
+more
+there
+into
+time
+over
+they
+during
+years
+most
+known
+only
+some
+made
+including
+___url___
+between
+under
+where
+about
+part
+later
+many
+three
+history
+such
+used
+then
+than
+united
+well
+while
+both
+being
+early
+states
+through
+year
+american
+became
+them
+these
+name
+called
+however
+before
+since
+would
+several
+until
+world
+second
+people
+following
+same
+high
+city
+area
+born
+four
+number
+life
+national
+family
+based
+north
+state
+named
+south
+those
+although
+because
+another
+work
+long
+like
+around
+each
+according
+former
+place
+along
+major
+line
+john
+still
+general
+large
+group
+small
+began
+school
+found
+will
+within
+located
+much
+west
+include
+often
+back
+very
+five
+last
+could
+present
+home
+against
+march
+main
+together
+june
+public
+series
+january
+october
+september
+great
+even
+july
+among
+every
+took
+included
+left
+late
+april
+best
+what
+just
+east
+system
+century
+down
+order
+times
+original
+august
+near
+december
+white
+member
+november
+become
+local
+house
+university
+total
+held
+third
+children
+given
+different
+government
+make
+various
+having
+death
+land
+international
+without
+british
+population
+received
+company
+though
+died
+using
+members
+english
+again
+married
+february
+county
+town
+built
+single
+considered
+point
+created
+came
+service
+served
+popular
+next
+take
+established
+period
+similar
+once
+others
+originally
+short
+central
+said
+york
+full
+career
+side
+making
+further
+published
+living
+released
+moved
+common
+development
+water
+never
+modern
+important
+show
+power
+below
+book
+went
+result
+country
+support
+little
+music
+less
+example
+role
+continued
+played
+produced
+written
+western
+addition
+upon
+film
+days
+final
+developed
+river
+size
+term
+throughout
+president
+post
+formed
+right
+black
+special
+started
+half
+current
+either
+community
+founded
+young
+eventually
+instead
+usually
+good
+center
+office
+seen
+thus
+taken
+control
+lost
+sometimes
+works
+rather
+open
+william
+free
+father
+returned
+french
+average
+almost
+college
+does
+political
+level
+education
+district
+notable
+james
+live
+older
+followed
+george
+england
+version
+wife
+television
+across
+king
+party
+position
+despite
+female
+northern
+production
+working
+described
+himself
+building
+southern
+case
+seven
+america
+play
+available
+includes
+close
+million
+team
+wrote
+today
+list
+largest
+areas
+should
+record
+whose
+above
+region
+park
+worked
+generally
+appeared
+especially
+itself
+middle
+away
+london
+native
+german
+leading
+remained
+months
+joined
+square
+least
+personal
+events
+military
+gave
+outside
+road
+return
+beginning
+david
+currently
+refer
+site
+units
+robert
+business
+lead
+alone
+parts
+possible
+provided
+station
+field
+soon
+official
+class
+church
+opened
+europe
+force
+features
+union
+able
+change
+army
+must
+related
+european
+eastern
+royal
+replaced
+story
+help
+race
+individuals
+association
+brought
+families
+means
+street
+come
+independent
+range
+summer
+involved
+society
+designed
+eight
+character
+referred
+changed
+announced
+light
+award
+significant
+ever
+kingdom
+council
+famous
+earlier
+night
+charles
+island
+private
+previously
+future
+introduced
+process
+services
+type
+added
+geography
+recorded
+director
+project
+provide
+successful
+france
+uses
+lower
+spread
+sold
+information
+husband
+court
+previous
+social
+language
+canada
+program
+limited
+village
+human
+african
+fact
+particularly
+hall
+game
+taking
+interest
+design
+culture
+action
+chief
+research
+completed
+census
+real
+prior
+numerous
+band
+paul
+civil
+lived
+media
+song
+allowed
+recent
+additional
+department
+season
+construction
+whom
+success
+higher
+radio
+longer
+complete
+records
+featured
+initially
+primary
+housing
+places
+certain
+club
+terms
+battle
+groups
+thomas
+behind
+hand
+rock
+better
+board
+already
+traditional
+finally
+particular
+decided
+release
+placed
+associated
+required
+entire
+always
+notes
+standard
+give
+forces
+science
+someone
+mother
+front
+sent
+mostly
+past
+fire
+here
+approximately
+rest
+elected
+active
+star
+performance
+remains
+natural
+income
+wide
+shows
+space
+location
+professional
+start
+thought
+fourth
+michael
+density
+study
+larger
+california
+performed
+germany
+richard
+cross
+playing
+caused
+daughter
+turn
+leader
+division
+word
+brother
+nearly
+love
+commercial
+appointed
+player

[4/4] tika git commit: TIKA-1332 -- add English Spanish common tokens; fix logging

2017-02-16 Thread tallison
TIKA-1332 -- add English Spanish common tokens;  fix logging


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/81150859
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/81150859
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/81150859

Branch: refs/heads/2.x
Commit: 81150859bdb25fe7faec575f5b916c8efad963cb
Parents: 6153225
Author: tballison 
Authored: Thu Feb 16 20:12:01 2017 -0500
Committer: tballison 
Committed: Thu Feb 16 20:12:01 2017 -0500

--
 .../org/apache/tika/eval/AbstractProfiler.java  |17 +-
 .../org/apache/tika/eval/ExtractProfiler.java   |24 +-
 .../java/org/apache/tika/eval/TikaEvalCLI.java  |22 +-
 .../tika/eval/batch/EvalConsumersBuilder.java   | 2 +-
 .../eval/batch/SingleFileConsumerBuilder.java   |18 +-
 .../eval/tokens/CommonTokenCountManager.java|75 +-
 tika-eval/src/main/resources/common_tokens/en   | 2 +
 tika-eval/src/main/resources/common_tokens/es   | 19997 
 tika-eval/src/main/resources/log4j.properties   |11 +
 .../resources/tika-eval-comparison-config.xml   | 6 +-
 .../resources/tika-eval-profiler-config.xml | 7 +-
 .../apache/tika/eval/SimpleComparerTest.java| 2 +-
 .../org/apache/tika/eval/TikaEvalCLITest.java   | 2 +-
 tika-eval/src/test/resources/common_tokens/en   | 8 +
 tika-eval/src/test/resources/common_tokens/es   |10 +
 .../src/test/resources/common_tokens/zh-cn  | 8 +
 .../src/test/resources/common_tokens/zh-tw  | 8 +
 tika-eval/src/test/resources/commontokens/en| 8 -
 tika-eval/src/test/resources/commontokens/es|10 -
 tika-eval/src/test/resources/commontokens/zh-cn | 8 -
 tika-eval/src/test/resources/commontokens/zh-tw | 8 -
 tika-eval/src/test/resources/log4j.properties   |11 -
 .../src/test/resources/log4j_process.properties |11 -
 ...ingle-file-profiler-crawl-extract-config.xml | 4 +-
 .../single-file-profiler-crawl-input-config.xml | 4 +-
 25 files changed, 40143 insertions(+), 138 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/tika/blob/81150859/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
--
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index 24f7358..daa964a 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -158,6 +158,11 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
 final LanguageIDWrapper langIder;
 protected IDBWriter writer;
 
+/**
+ *
+ * @param p path to the common_tokens directory.  If this is null, try to load from classPath
+ * @throws IOException
+ */
 public static void loadCommonTokens(Path p) throws IOException {
 commonTokenCountManager = new CommonTokenCountManager(p);
 }
@@ -536,29 +541,29 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
 /**
  *
  * @param metadata
- * @param extractDir
+ * @param extracts
  * @return evalfilepaths for files if crawling an extract directory
  */
 protected EvalFilePaths getPathsFromExtractCrawl(Metadata metadata,
- Path extractDir) {
+ Path extracts) {
 String relExtractFilePath = metadata.get(FSProperties.FS_REL_PATH);
 Matcher m = FILE_NAME_CLEANER.matcher(relExtractFilePath);
 Path relativeSourceFilePath = Paths.get(m.replaceAll(""));
 //just try slapping the relextractfilepath on the extractdir
-Path extractFile = extractDir.resolve(relExtractFilePath);
+Path extractFile = extracts.resolve(relExtractFilePath);
 if (! Files.isRegularFile(extractFile)) {
 //if that doesn't work, try to find the right extract file.
 //This is necessary if crawling extractsA and trying to find a file in
 //extractsB that is not in the same format: json vs txt or compressed
-extractFile = findFile(extractDir, relativeSourceFilePath);
+extractFile = findFile(extracts, relativeSourceFilePath);
 }
 return new EvalFilePaths(relativeSourceFilePath, extractFile);
 }
 //call this if the crawler is crawling through the src directory
 protected EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path srcDir,
- Path extractDir) {
+ Path extracts) {
  

[2/4] tika git commit: TIKA-1332 -- add English Spanish common tokens; fix logging

2017-02-16 Thread tallison
http://git-wip-us.apache.org/repos/asf/tika/blob/81150859/tika-eval/src/main/resources/common_tokens/es
--
diff --git a/tika-eval/src/main/resources/common_tokens/es 
b/tika-eval/src/main/resources/common_tokens/es
new file mode 100644
index 000..2889e7c
--- /dev/null
+++ b/tika-eval/src/main/resources/common_tokens/es
@@ -0,0 +1,19997 @@
+como
+para
+tambien
+esta
+entre
+este
+desde
+anos
+hasta
+parte
+donde
+sobre
+___url___
+durante
+pero
+historia
+tiene
+vease
+primera
+nombre
+ciudad
+despues
+cuando
+otros
+gran
+encuentra
+cual
+familia
+primer
+estado
+mismo
+estados
+solo
+siglo
+tres
+otras
+unidos
+segun
+ademas
+habia
+forma
+puede
+fueron
+nacional
+bajo
+espana
+aunque
+poblacion
+lugar
+junto
+sido
+mayor
+tras
+todo
+oficial
+ingles
+tiempo
+siendo
+nueva
+todos
+estaba
+general
+antes
+vida
+tenia
+nuevo
+hacia
+grupo
+tanto
+segunda
+quien
+cada
+algunos
+mientras
+personas
+embargo
+varios
+norte
+espanol
+julio
+habitantes
+debido
+total
+estos
+provincia
+luego
+estadounidense
+tuvo
+bibliografia
+eran
+mundo
+contra
+otro
+centro
+region
+cuatro
+mayo
+principal
+entonces
+pais
+conocido
+universidad
+trabajo
+cuenta
+unos
+septiembre
+enero
+octubre
+guerra
+marzo
+serie
+segundo
+junio
+diciembre
+final
+otra
+dentro
+agosto
+especie
+abril
+paso
+noviembre
+ellos
+internacional
+tierra
+actualmente
+numero
+varias
+esto
+algunas
+hace
+juan
+poco
+hizo
+carrera
+estas
+traves
+estan
+febrero
+jose
+biografia
+muchos
+partir
+medio
+demografia
+estudio
+grandes
+mejor
+agua
+misma
+sitio
+largo
+caso
+equipo
+tarde
+siguiente
+obra
+tipo
+gobierno
+publico
+todas
+bien
+ultimo
+llamado
+posteriormente
+primeros
+geografia
+mundial
+pueden
+casa
+cualquier
+sistema
+maria
+zona
+importante
+momento
+desarrollo
+real
+frente
+epoca
+hijo
+menos
+comenzo
+hecho
+actual
+ella
+toda
+obras
+muerte
+diferentes
+termino
+departamento
+orden
+central
+partido
+titulo
+municipio
+veces
+cerca
+argentina
+casi
+base
+principales
+origen
+lista
+cuales
+dias
+tienen
+francia
+periodo
+edad
+siguientes
+presidente
+padre
+mayoria
+manera
+finalmente
+pagina
+distrito
+estudios
+cinco
+localidad
+hacer
+punto
+alrededor
+superficie
+haber
+capital
+club
+carlos
+nivel
+censo
+santa
+pueblo
+mexico
+poder
+espanola
+condado
+finales
+ante
+ubicado
+nacio
+distribucion
+cambio
+america
+republica
+ambos
+fuera
+conocida
+tener
+genero
+television
+director
+llego
+puesto
+mediante
+ejemplo
+musica
+caracteristicas
+ultima
+luis
+pesar
+europa
+siempre
+estuvo
+lado
+llamada
+escuela
+miembro
+cargo
+paises
+importantes
+inicio
+area
+linea
+primero
+muchas
+alto
+reino
+futbol
+mucho
+edicion
+incluso
+politica
+publicado
+propio
+produccion
+oficina
+francisco
+oeste
+popular
+convirtio
+notas
+division
+media
+antonio
+encuentran
+version
+temporada
+cabo
+premio
+miembros
+madrid
+iglesia
+original
+union
+frances
+francesa
+exito
+superior
+construccion
+unico
+principalmente
+antiguo
+recibio
+porque
+pelicula
+libro
+alli
+informacion
+direccion
+igual
+densidad
+grande
+sino
+debe
+campo
+hombres
+banda
+compuesto
+acuerdo
+seria
+politico
+anterior
+relacion
+proyecto
+decada
+ello
+seis
+alta
+menor
+estilo
+metros
+principios
+cuerpo
+resto
+posicion
+meses
+realizo
+incluyendo
+datos
+ahora
+cultura
+mujeres
+categoria
+situada
+servicio
+york
+programa
+sede
+militar
+especial
+comunidad
+perteneciente
+especialmente
+unas
+antigua
+profesional
+sociedad
+interior
+ellas
+habian
+nacido
+coordenadas
+alemania
+instituto
+obtuvo
+pedro
+natural
+descripcion
+local
+trata
+proceso
+social
+conjunto
+organizacion
+costa
+pequeno
+incluye
+gracias
+vista
+album
+actualidad
+compania
+aparece
+estaban
+participo
+propia
+estar
+tercera
+honor
+formacion
+ultimos
+comun
+color
+joven
+llegar
+personal
+italia
+nombrado
+revista
+nunca
+diversos
+papel
+madre
+arte
+pacifico
+hombre
+john
+dicho
+unica
+pues
+tercer
+grupos
+decir
+manuel
+especies
+creacion
+isla
+situado
+referencia
+cuyo
+primeras
+fecha
+posible
+modo
+interes
+hijos
+lugares
+derecho
+unido
+canciones
+pequena
+buenos
+cine
+situacion
+mujer
+poblacional
+cancion
+muestra
+gano
+medios
+territorio
+ubicada
+resultado
+movimiento
+millones
+problemas
+llevo
+actividad
+paris
+blanco
+martin
+miguel
+hija
+publica
+servicios
+chile
+nuevos
+altura
+civil
+existen
+algo
+autor
+principio
+blancos
+empresa
+plantas
+hermano
+ejercito
+diversas
+ocasiones
+aproximadamente
+libre
+partidos
+dado
+musical
+campeonato
+regreso
+trayectoria
+puntos
+partes
+apoyo
+baja
+cantidad
+lleva
+tenian
+dicha
+aleman
+nacionales
+causa
+victoria
+existe
+nuevas
+posee
+isbn
+teatro
+control
+geografica
+personajes
+premios
+ciudades
+cuya
+asociacion
+murio
+juegos
+similar
+clubes
+santiago
+participacion
+presencia
+seleccion
+presenta
+decidio
+museo
+volvio
+siete
+liga
+fundacion
+logro
+actividades
+equipos
+raza
+espacio
+fuerza
+objetivo
+cultural
+fuerte

tika git commit: TIKA-1332 -- fix analyzer chain for common tokens, clean up UTF-8 references

2017-02-16 Thread tallison
Repository: tika
Updated Branches:
  refs/heads/master 6c6b77b41 -> a2d214c71


TIKA-1332 -- fix analyzer chain for common tokens, clean up UTF-8 references


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/a2d214c7
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/a2d214c7
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/a2d214c7

Branch: refs/heads/master
Commit: a2d214c71602f4f5a84635adc38c43182a39a390
Parents: 6c6b77b
Author: tballison 
Authored: Thu Feb 16 15:41:53 2017 -0500
Committer: tballison 
Committed: Thu Feb 16 15:41:53 2017 -0500

--
 .../org/apache/tika/eval/io/ExtractReader.java  |  3 +-
 .../tika/eval/tokens/AnalyzerManager.java   |  3 +-
 .../apache/tika/eval/tokens/TokenIntPair.java   |  4 +--
 .../src/main/resources/lucene-analyzers.json| 32 +---
 4 files changed, 33 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/tika/blob/a2d214c7/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
--
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java 
b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
index cd90f76..2631f44 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
@@ -5,6 +5,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
@@ -78,7 +79,7 @@ public class ExtractReader {
 LOGGER.warn("Can't yet process compression of type: "+fileSuffixes.compression);
 }
 }
-reader = new BufferedReader(new InputStreamReader(is, 
"UTF-8"));
+reader = new BufferedReader(new InputStreamReader(is, 
StandardCharsets.UTF_8));
 
 if (fileSuffixes.txtOrJson.equals("json")) {
 metadataList = JsonMetadataList.fromJson(reader);

http://git-wip-us.apache.org/repos/asf/tika/blob/a2d214c7/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java
--
diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java 
b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java
index db6ae26..774b19a 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java
@@ -20,6 +20,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
+import java.nio.charset.StandardCharsets;
 import java.util.Map;
 
 import com.google.gson.Gson;
@@ -47,7 +48,7 @@ public class AnalyzerManager {
 
 public static AnalyzerManager newInstance() throws IOException {
 InputStream is = 
AnalyzerManager.class.getClassLoader().getResourceAsStream("lucene-analyzers.json");
-Reader reader = new InputStreamReader(is, "UTF-8");
+Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
 GsonBuilder builder = new GsonBuilder();
 builder.registerTypeHierarchyAdapter(Map.class, new 
AnalyzerDeserializer());
 Gson gson = builder.create();

http://git-wip-us.apache.org/repos/asf/tika/blob/a2d214c7/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenIntPair.java
--
diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenIntPair.java 
b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenIntPair.java
index 4b57d25..a924f07 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenIntPair.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenIntPair.java
@@ -17,8 +17,6 @@
 package org.apache.tika.eval.tokens;
 
 
-import org.jetbrains.annotations.NotNull;
-
 public class TokenIntPair implements Comparable {
 
 final String token;
@@ -63,7 +61,7 @@ public class TokenIntPair implements Comparable 
{
  * @return comparison
  */
 @Override
-public int compareTo(@NotNull TokenIntPair o) {
+public int compareTo(TokenIntPair o) {
 if (this.value > o.value) {
 return -1;
 } else if (this.value < o.value) {

http://git-wip-us.apache.org/repos/asf/tika/blob/a2d214c7/tika-eval/src/main/resources/lucene-analyzers.json
--
diff --git 

tika git commit: TIKA-1332 3rd time's the charm. Fix dependencies with IOUtils.

2017-02-16 Thread tallison
Repository: tika
Updated Branches:
  refs/heads/2.x 44612ae40 -> 61532258f


TIKA-1332 3rd time's the charm.  Fix dependencies with IOUtils.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/61532258
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/61532258
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/61532258

Branch: refs/heads/2.x
Commit: 61532258f2ff44787050f0f3a0bb8ba17d8e50b0
Parents: 44612ae
Author: tballison 
Authored: Thu Feb 16 14:41:13 2017 -0500
Committer: tballison 
Committed: Thu Feb 16 14:41:13 2017 -0500

--
 .../main/java/org/apache/tika/eval/XMLErrorLogUpdater.java| 2 +-
 tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java   | 2 +-
 tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java | 2 +-
 .../src/main/java/org/apache/tika/eval/io/ExtractReader.java  | 2 +-
 .../src/main/java/org/apache/tika/eval/io/XMLLogReader.java   | 7 ---
 .../main/java/org/apache/tika/eval/tokens/TokenIntPair.java   | 4 +---
 6 files changed, 9 insertions(+), 10 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/tika/blob/61532258/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
--
diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java 
b/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
index 9a7e7aa..eaaf228 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
@@ -32,6 +32,7 @@ import java.sql.ResultSet;
 import java.sql.SQLException;
 import java.sql.Statement;
 
+import org.apache.commons.io.IOExceptionWithCause;
 import org.apache.log4j.Level;
 import org.apache.tika.eval.db.Cols;
 import org.apache.tika.eval.db.DBUtil;
@@ -40,7 +41,6 @@ import org.apache.tika.eval.db.TableInfo;
 import org.apache.tika.eval.io.XMLLogMsgHandler;
 import org.apache.tika.eval.io.XMLLogReader;
 import org.apache.tika.eval.reports.ResultsReporter;
-import org.apache.tika.io.IOExceptionWithCause;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 

http://git-wip-us.apache.org/repos/asf/tika/blob/61532258/tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java
--
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java 
b/tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java
index 1efa48a..d99798d 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java
@@ -31,8 +31,8 @@ import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 
+import org.apache.commons.io.IOExceptionWithCause;
 import org.apache.log4j.Logger;
-import org.apache.tika.io.IOExceptionWithCause;
 
 public abstract class DBUtil {
 

http://git-wip-us.apache.org/repos/asf/tika/blob/61532258/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
--
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java 
b/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
index db4cd04..383f25c 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
@@ -25,6 +25,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.concurrent.atomic.AtomicLong;
 
+import org.apache.commons.io.IOExceptionWithCause;
 import org.apache.log4j.Logger;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.eval.db.ColInfo;
@@ -32,7 +33,6 @@ import org.apache.tika.eval.db.Cols;
 import org.apache.tika.eval.db.DBUtil;
 import org.apache.tika.eval.db.MimeBuffer;
 import org.apache.tika.eval.db.TableInfo;
-import org.apache.tika.io.IOExceptionWithCause;
 
 /**
  * This is still in its early stages.  The idea is to

http://git-wip-us.apache.org/repos/asf/tika/blob/61532258/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
--
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java 
b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
index cd90f76..f703408 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
@@ -15,9 +15,9 @@ import java.util.regex.Pattern;
 import 
org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
 import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
 import 

tika git commit: TIKA-1332 -- clean up commons-io version mgmt

2017-02-16 Thread tallison
Repository: tika
Updated Branches:
  refs/heads/master d194ba402 -> 6c6b77b41


TIKA-1332 -- clean up commons-io version mgmt


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/6c6b77b4
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/6c6b77b4
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/6c6b77b4

Branch: refs/heads/master
Commit: 6c6b77b4159d4e7bbebd883cb52f2160be9cc5a6
Parents: d194ba4
Author: tballison 
Authored: Thu Feb 16 13:39:26 2017 -0500
Committer: tballison 
Committed: Thu Feb 16 13:39:26 2017 -0500

--
 tika-eval/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/tika/blob/6c6b77b4/tika-eval/pom.xml
--
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index f1758cc..8bc7680 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -87,7 +87,7 @@
 
 commons-io
 commons-io
-2.4
+${commons.io.version}
 
 
 



tika git commit: TIKA-1332 fix pom for 2.0

2017-02-16 Thread tallison
Repository: tika
Updated Branches:
  refs/heads/2.x 0d04b499a -> 44612ae40


TIKA-1332 fix pom for 2.0


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/44612ae4
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/44612ae4
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/44612ae4

Branch: refs/heads/2.x
Commit: 44612ae405d1342661387f74320e13c96301754b
Parents: 0d04b49
Author: tballison 
Authored: Thu Feb 16 13:37:26 2017 -0500
Committer: tballison 
Committed: Thu Feb 16 13:37:26 2017 -0500

--
 tika-eval/pom.xml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/tika/blob/44612ae4/tika-eval/pom.xml
--
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index 9167742..ce85229 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -35,7 +35,7 @@
 
 org.apache.tika
 tika-parent
-1.15-SNAPSHOT
+2.0-SNAPSHOT
 ../tika-parent/pom.xml
 
 
@@ -87,7 +87,7 @@
 
 commons-io
 commons-io
-2.4
+${commons.io.version}
 
 
 



tika git commit: TIKA-1332 downgrade to Lucene 5.x so that this can run w/ Java 7

2017-02-16 Thread tallison
Repository: tika
Updated Branches:
  refs/heads/2.x 69dd0328b -> 0d04b499a


TIKA-1332 downgrade to Lucene 5.x so that this can run w/ Java 7


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/0d04b499
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/0d04b499
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/0d04b499

Branch: refs/heads/2.x
Commit: 0d04b499a6c305c6c0656f37abfd6f78440ea309
Parents: 69dd032
Author: tballison 
Authored: Thu Feb 16 12:59:28 2017 -0500
Committer: tballison 
Committed: Thu Feb 16 12:59:28 2017 -0500

--
 tika-eval/pom.xml | 3 ++-
 .../org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java  | 2 +-
 .../tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java   | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/tika/blob/0d04b499/tika-eval/pom.xml
--
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index ee0940c..9167742 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -26,7 +26,8 @@
 4.0.0
 
 1.3.1 
-6.2.1
+
+5.5.3
 3.16-beta2
 
 

http://git-wip-us.apache.org/repos/asf/tika/blob/0d04b499/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java
--
diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java
 
b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java
index fb72e84..2c046ad 100644
--- 
a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java
+++ 
b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java
@@ -19,9 +19,9 @@ package org.apache.tika.eval.tokens;
 import java.io.IOException;
 import java.util.Map;
 
-import org.apache.lucene.analysis.FilteringTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 
 /**

http://git-wip-us.apache.org/repos/asf/tika/blob/0d04b499/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java
--
diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java
 
b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java
index 31fa866..549e85d 100644
--- 
a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java
+++ 
b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java
@@ -3,11 +3,11 @@ package org.apache.tika.eval.tokens;
 import java.io.IOException;
 import java.util.Map;
 
-import org.apache.lucene.analysis.FilteringTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.cjk.CJKBigramFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 
 /**



tika git commit: TIKA-1332 -- downgrade Lucene to 5.x to allow for Java 7

2017-02-16 Thread tallison
Repository: tika
Updated Branches:
  refs/heads/master 506b57256 -> d194ba402


TIKA-1332 -- downgrade Lucene to 5.x to allow for Java 7


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d194ba40
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d194ba40
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d194ba40

Branch: refs/heads/master
Commit: d194ba4022dffa61cad2a12ea0092f6ec00588d2
Parents: 506b572
Author: tballison 
Authored: Thu Feb 16 12:57:22 2017 -0500
Committer: tballison 
Committed: Thu Feb 16 12:57:22 2017 -0500

--
 tika-eval/pom.xml | 3 ++-
 .../org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java  | 2 +-
 .../tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java   | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/tika/blob/d194ba40/tika-eval/pom.xml
--
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index ec2c18b..f1758cc 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -26,7 +26,8 @@
 4.0.0
 
 1.3.1 
-6.2.1
+
+5.5.3
 3.16-beta2
 
 

http://git-wip-us.apache.org/repos/asf/tika/blob/d194ba40/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java
--
diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java
 
b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java
index fb72e84..2c046ad 100644
--- 
a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java
+++ 
b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java
@@ -19,9 +19,9 @@ package org.apache.tika.eval.tokens;
 import java.io.IOException;
 import java.util.Map;
 
-import org.apache.lucene.analysis.FilteringTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 
 /**

http://git-wip-us.apache.org/repos/asf/tika/blob/d194ba40/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java
--
diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java
 
b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java
index 31fa866..549e85d 100644
--- 
a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java
+++ 
b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java
@@ -3,11 +3,11 @@ package org.apache.tika.eval.tokens;
 import java.io.IOException;
 import java.util.Map;
 
-import org.apache.lucene.analysis.FilteringTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.cjk.CJKBigramFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.analysis.util.FilteringTokenFilter;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 
 /**



tika git commit: TIKA-1332 -- fix one report for eval profiler and clean up whitespace

2017-02-16 Thread tallison
Repository: tika
Updated Branches:
  refs/heads/master aa7a0c353 -> 506b57256


TIKA-1332 -- fix one report for eval profiler and clean up whitespace


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/506b5725
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/506b5725
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/506b5725

Branch: refs/heads/master
Commit: 506b572560f6c7f44270b55877f110719a7d4b1f
Parents: aa7a0c3
Author: tballison 
Authored: Thu Feb 16 12:33:58 2017 -0500
Committer: tballison 
Committed: Thu Feb 16 12:33:58 2017 -0500

--
 .../src/main/resources/comparison-reports.xml   |  2 +-
 .../src/main/resources/lucene-analyzers.json| 30 +++--
 .../src/main/resources/profile-reports.xml  | 11 ++--
 .../resources/tika-eval-comparison-config.xml   | 65 ++--
 ...ingle-file-profiler-crawl-extract-config.xml |  2 +-
 .../single-file-profiler-crawl-input-config.xml |  2 +-
 6 files changed, 52 insertions(+), 60 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/tika/blob/506b5725/tika-eval/src/main/resources/comparison-reports.xml
--
diff --git a/tika-eval/src/main/resources/comparison-reports.xml 
b/tika-eval/src/main/resources/comparison-reports.xml
index cb7befd..d69cb2a 100644
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ b/tika-eval/src/main/resources/comparison-reports.xml
@@ -206,7 +206,7 @@
 
 
 
-   

http://git-wip-us.apache.org/repos/asf/tika/blob/506b5725/tika-eval/src/main/resources/lucene-analyzers.json
--
diff --git a/tika-eval/src/main/resources/lucene-analyzers.json 
b/tika-eval/src/main/resources/lucene-analyzers.json
index 268494f..f7141f7 100644
--- a/tika-eval/src/main/resources/lucene-analyzers.json
+++ b/tika-eval/src/main/resources/lucene-analyzers.json
@@ -1,12 +1,11 @@
 {
   "analyzers": {
-"general" :
-{
+"general": {
   "charfilters": [
 {
   "factory": "oala.charfilter.MappingCharFilterFactory",
   "params": {
-"mapping" : "/lucene-char-mapping.txt"
+"mapping": "/lucene-char-mapping.txt"
   }
 }
   ],
@@ -22,20 +21,17 @@
 {
   "factory": "oala.cjk.CJKBigramFilterFactory",
   "params": {
-"outputUnigrams" : "false"
+"outputUnigrams": "false"
   }
 }
   ]
-
 },
-
-"alpha" :
-{
+"alpha": {
   "charfilters": [
 {
   "factory": "oala.charfilter.MappingCharFilterFactory",
   "params": {
-"mapping" : "/lucene-char-mapping.txt"
+"mapping": "/lucene-char-mapping.txt"
   }
 }
   ],
@@ -67,7 +63,7 @@
 {
   "factory": "oala.cjk.CJKBigramFilterFactory",
   "params": {
-"outputUnigrams" : "false"
+"outputUnigrams": "false"
   }
 },
 {
@@ -75,33 +71,27 @@
   "params": {}
 }
   ]
-
 },
-"common_tokens" :
-{
+"common_tokens": {
   "tokenizer": {
 "factory": "oala.standard.UAX29URLEmailTokenizerFactory",
 "params": {}
   },
-
   "tokenfilters": [
 {
   "factory": "oala.cjk.CJKBigramFilterFactory",
   "params": {
-"outputUnigrams" : "false"
+"outputUnigrams": "false"
   }
 },
 {
   "factory": 
"org.apache.tika.eval.tokens.CJKBigramAwareLengthFilterFactory",
   "params": {
-"min" : 4,
-"max" : 20
+"min": 4,
+"max": 20
   }
 }
-
   ]
-
 }
-
   }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/506b5725/tika-eval/src/main/resources/profile-reports.xml
--
diff --git a/tika-eval/src/main/resources/profile-reports.xml 
b/tika-eval/src/main/resources/profile-reports.xml
index 2a94a97..1f9be6a 100644
--- a/tika-eval/src/main/resources/profile-reports.xml
+++ b/tika-eval/src/main/resources/profile-reports.xml
@@ -98,7 +98,6 @@
 
 
 
-
 
 
 
-select LANG_ID_1 as DetectedLang, count(1) as cnt
-from contents
-group by LANG_ID_1
-order by cnt desc
+select parse_exception_description, count(1) cnt
+from parse_exceptions e
+join profiles p on p.id = e.id
+join ref_parse_exception_types et on 
et.parse_exception_type_id=e.parse_exception_type_id
+group by parse_exception_description
+order by cnt desc;
 

[5/6] tika git commit: TIKA-1332 initial commit of tika-eval. More work remains.

2017-02-16 Thread tallison
http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
--
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java 
b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
new file mode 100644
index 000..5860327
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
@@ -0,0 +1,262 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.ParseException;
+import org.apache.tika.batch.fs.FSBatchProcessCLI;
+import org.apache.tika.eval.reports.ResultsReporter;
+import org.h2.tools.Console;
+
+public class TikaEvalCLI {
+static final String[] tools = {"Profile", "Compare", "Report", "StartDB"};
+
+private static String specifyTools() {
+StringBuilder sb = new StringBuilder();
+sb.append("Must specify one of the following tools in the first parameter:\n");
+for (String s : tools) {
+sb.append(s+"\n");
+}
+return sb.toString();
+
+}
+
+private void execute(String[] args) throws Exception {
+String tool = args[0];
+String[] subsetArgs = new String[args.length-1];
+System.arraycopy(args, 1, subsetArgs, 0, args.length - 1);
+if (tool.equals("Report")) {
+handleReport(subsetArgs);
+} else if (tool.equals("Compare")) {
+handleCompare(subsetArgs);
+} else if (tool.equals("Profile")) {
+handleProfile(subsetArgs);
+} else if (tool.equals("StartDB")) {
+handleStartDB(subsetArgs);
+} else {
+System.out.println(specifyTools());
+}
+}
+
+private void handleStartDB(String[] args) throws SQLException {
+List argList = new ArrayList<>();
+argList.add("-web");
+Console.main(argList.toArray(new String[argList.size()]));
+while(true) {
+try {
+Thread.sleep(1000);
+} catch (InterruptedException e){
+break;
+}
+}
+}
+
+private void handleProfile(String[] subsetArgs) throws Exception {
+List argList = new ArrayList(Arrays.asList(subsetArgs));
+
+boolean containsBC = false;
+String inputDir = null;
+String extractDir = null;
+String alterExtract = null;
+//confirm there's a batch-config file
+for (int i = 0; i < argList.size(); i++) {
+String arg = argList.get(i);
+if (arg.equals("-bc")) {
+containsBC = true;
+} else if (arg.equals("-inputDir")) {
+if (i+1 >= argList.size()) {
+System.err.println("Must specify directory after -inputDir");
+ExtractProfiler.USAGE();
+return;
+}
+inputDir = argList.get(i+1);
+i++;
+} else if (arg.equals("-extractDir")) {
+if (i+1 >= argList.size()) {
+System.err.println("Must specify directory after -extractDir");
+ExtractProfiler.USAGE();
+return;
+}
+extractDir = argList.get(i+1);
+i++;
+} else if (arg.equals("-alterExtract")) {
+if (i+1 >= argList.size()) {
+System.err.println("Must specify directory after -extractsB");
+ExtractComparer.USAGE();
+return;
+}
+alterExtract = argList.get(i+1);
+i++;
+}
+}
+
+if (alterExtract != null && !alterExtract.equals("as_is") &&
+!alterExtract.equals("concatenate_content") &&
+!alterExtract.equals("first_only")) {
+

[4/6] tika git commit: TIKA-1332 initial commit of tika-eval. More work remains.

2017-02-16 Thread tallison
http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
--
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java 
b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
new file mode 100644
index 000..cd90f76
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
@@ -0,0 +1,161 @@
+package org.apache.tika.eval.io;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import 
org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.compress.compressors.z.ZCompressorInputStream;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class ExtractReader {
+
+public enum ALTER_METADATA_LIST {
+AS_IS,  //leave the metadata list as is
+FIRST_ONLY, //take only the metadata list for the "container" document
+CONCATENATE_CONTENT_INTO_FIRST // concatenate all of the content into 
the first
+}
+private final static Logger LOGGER = 
LoggerFactory.getLogger(ExtractReader.class);
+TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
+
+public List loadExtract(Path thisFile, ALTER_METADATA_LIST 
alterExtractList) {
+List metadataList = null;
+if (thisFile == null || !Files.isRegularFile(thisFile)) {
+return metadataList;
+}
+Reader reader = null;
+InputStream is = null;
+FileSuffixes fileSuffixes = 
parseSuffixes(thisFile.getFileName().toString());
+if (fileSuffixes.txtOrJson == null) {
+LOGGER.warn("file must end with .txt or .json: 
"+thisFile.getFileName().toString());
+return metadataList;
+}
+
+try {
+is = Files.newInputStream(thisFile);
+if (fileSuffixes.compression != null) {
+if (fileSuffixes.compression.equals("bz2")) {
+is = new BZip2CompressorInputStream(is);
+} else if (fileSuffixes.compression.equals("gz")) {
+is = new GzipCompressorInputStream(is);
+} else if (fileSuffixes.compression.equals("zip")) {
+is = new ZCompressorInputStream(is);
+} else {
+LOGGER.warn("Can't yet process compression of type: 
"+fileSuffixes.compression);
+}
+}
+reader = new BufferedReader(new InputStreamReader(is, 
"UTF-8"));
+
+if (fileSuffixes.txtOrJson.equals("json")) {
+metadataList = JsonMetadataList.fromJson(reader);
+if (alterExtractList.equals(ALTER_METADATA_LIST.FIRST_ONLY) && 
metadataList.size() > 1) {
+while (metadataList.size() > 1) {
+metadataList.remove(metadataList.size()-1);
+}
+} else if 
(alterExtractList.equals(ALTER_METADATA_LIST.AS_IS.CONCATENATE_CONTENT_INTO_FIRST)
 &&
+metadataList.size() > 1) {
+StringBuilder sb = new StringBuilder();
+Metadata containerMetadata = metadataList.get(0);
+for (int i = 0; i < metadataList.size(); i++) {
+Metadata m = metadataList.get(i);
+String c = m.get(RecursiveParserWrapper.TIKA_CONTENT);
+if (c 

[1/6] tika git commit: TIKA-1332 initial commit of tika-eval. More work remains.

2017-02-16 Thread tallison
Repository: tika
Updated Branches:
  refs/heads/2.x 6bfe5d565 -> 5e49c3308


http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json
--
diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json 
b/tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json
new file mode 100644
index 000..6ef09de
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json
@@ -0,0 +1,5 @@
+[{
+  "Content-Type":"text/plain",
+  "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog 
1,200 12",
+  "xmpTPg:NPages":2
+}]
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/test-dirs/extractsA/file10_permahang.txt.json
--
diff --git 
a/tika-eval/src/test/resources/test-dirs/extractsA/file10_permahang.txt.json 
b/tika-eval/src/test/resources/test-dirs/extractsA/file10_permahang.txt.json
new file mode 100644
index 000..e69de29

http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/test-dirs/extractsA/file11_oom.txt.json
--
diff --git 
a/tika-eval/src/test/resources/test-dirs/extractsA/file11_oom.txt.json 
b/tika-eval/src/test/resources/test-dirs/extractsA/file11_oom.txt.json
new file mode 100644
index 000..e69de29

http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json
--
diff --git 
a/tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json 
b/tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json
new file mode 100644
index 000..0e2558b
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json
@@ -0,0 +1,4 @@
+[{
+  "Content-Type":"text/plain",
+  "X-TIKA:content":"El zorro marrón rápido saltó sobre el perro. El zorro 
marrón rápido saltó sobre el perro. El zorro marrón rápido saltó sobre el 
perro"
+}]
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json
--
diff --git 
a/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json 
b/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json
new file mode 100644
index 000..5371c87
--- /dev/null
+++ 
b/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json
@@ -0,0 +1,10 @@
+[{
+  "Content-Type":"text/plain",
+  "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog"
+  },
+  {
+"Content-Type":"text/plain",
+"X-TIKA:embedded_resource_path":"inner.txt",
+"X-TIKA:content":"attachment contents"
+  }
+]
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json
--
diff --git 
a/tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json 
b/tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json
new file mode 100644
index 000..5371c87
--- /dev/null
+++ 
b/tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json
@@ -0,0 +1,10 @@
+[{
+  "Content-Type":"text/plain",
+  "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog"
+  },
+  {
+"Content-Type":"text/plain",
+"X-TIKA:embedded_resource_path":"inner.txt",
+"X-TIKA:content":"attachment contents"
+  }
+]
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json
--
diff --git 
a/tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json 
b/tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json
new file mode 100644
index 000..18763d1
--- /dev/null
+++ 
b/tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json
@@ -0,0 +1,4 @@
+[{
+  "Content-Type":"text/plain",
+  "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog"
+}]
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/test-dirs/extractsA/file4_emptyB.pdf.json
--
diff --git 
a/tika-eval/src/test/resources/test-dirs/extractsA/file4_emptyB.pdf.json 
b/tika-eval/src/test/resources/test-dirs/extractsA/file4_emptyB.pdf.json
new file mode 

[2/6] tika git commit: TIKA-1332 initial commit of tika-eval. More work remains.

2017-02-16 Thread tallison
http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java
--
diff --git 
a/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java 
b/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java
new file mode 100644
index 000..0d925cf
--- /dev/null
+++ b/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java
@@ -0,0 +1,411 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.nio.file.FileSystems;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.ResultSetMetaData;
+import java.sql.Statement;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.tika.batch.fs.FSBatchTestBase;
+import org.apache.tika.eval.db.Cols;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
+import org.junit.Test;
+
+@Ignore("need to fix tika-batch tests to make this work")
+public class ComparerBatchTest extends FSBatchTestBase {
+
+public final static String COMPARER_PROCESS_CLASS = 
"org.apache.tika.batch.fs.FSBatchProcessCLI";
+
+private static Path dbDir;
+private static Connection conn;
+
+private final static String compJoinCont = "";
+/*ExtractComparer.COMPARISONS_TABLE+" cmp " +
+"join "+ExtractComparer.CONTAINERS_TABLE + " cnt "+
+"on cmp."+AbstractProfiler.CONTAINER_HEADERS.CONTAINER_ID+
+" = cnt."+AbstractProfiler.CONTAINER_HEADERS.CONTAINER_ID;*/
+
+@BeforeClass
+public static void setUp() throws Exception {
+
+File inputRoot = new 
File(ComparerBatchTest.class.getResource("/test-dirs").toURI());
+dbDir = Files.createTempDirectory(inputRoot.toPath(), 
"tika-test-db-dir-");
+Map args = new HashMap<>();
+Path db = FileSystems.getDefault().getPath(dbDir.toString(), 
"comparisons_test");
+args.put("-db", db.toString());
+
+//for debugging, you can use this to select only one file pair to load
+//args.put("-includeFilePat", "file8.*");
+/*
+BatchProcessTestExecutor ex = new 
BatchProcessTestExecutor(COMPARER_PROCESS_CLASS, args,
+"/tika-batch-comparison-eval-config.xml");
+StreamStrings streamStrings = ex.execute();
+System.out.println(streamStrings.getErrString());
+System.out.println(streamStrings.getOutString());
+H2Util dbUtil = new H2Util(db);
+conn = dbUtil.getConnection();*/
+}
+
+@AfterClass
+public static void tearDown() throws Exception {
+
+conn.close();
+
+FileUtils.deleteDirectory(dbDir.toFile());
+}
+
+
+@Test
+public void testSimpleDBWriteAndRead() throws Exception {
+Set set = new HashSet<>();
+//filenames
+List list = getColStrings(Cols.FILE_NAME.name(),
+ExtractComparer.PROFILES_A.getName(), "");
+assertEquals(7, list.size());
+assertTrue(list.contains("file1.pdf"));
+
+//container ids in comparisons table
+list = getColStrings(Cols.CONTAINER_ID.name(),
+ExtractComparer.COMPARISON_CONTAINERS.getName(),"");
+assertEquals(10, list.size());
+set.clear(); set.addAll(list);
+assertEquals(10, set.size());
+/*
+//ids in comparisons table
+list = getColStrings(AbstractProfiler.HEADERS.ID.name(),
+compTable,"");
+assertEquals(9, list.size());
+set.clear(); set.addAll(list);
+assertEquals(9, set.size());*/
+}
+
+
+
+/*
+@Test
+public void testFile1PDFRow() throws Exception {
+String where = fp+"='file1.pdf'";
+Map data = getRow(compJoinCont, where);
+String result = 

[6/6] tika git commit: TIKA-1332 initial commit of tika-eval. More work remains.

2017-02-16 Thread tallison
TIKA-1332 initial commit of tika-eval.  More work remains.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/5e49c330
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/5e49c330
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/5e49c330

Branch: refs/heads/2.x
Commit: 5e49c33087bbf03763b05efda3bbb96d8cc20ea4
Parents: 6bfe5d5
Author: tballison 
Authored: Thu Feb 16 12:19:54 2017 -0500
Committer: tballison 
Committed: Thu Feb 16 12:19:54 2017 -0500

--
 CHANGES.txt |   2 +
 LICENSE.txt |   8 +
 pom.xml |   1 +
 tika-eval/pom.xml   | 281 +++
 .../org/apache/tika/eval/AbstractProfiler.java  | 693 
 .../org/apache/tika/eval/EvalFilePaths.java | 108 +++
 .../org/apache/tika/eval/ExtractComparer.java   | 455 +++
 .../org/apache/tika/eval/ExtractProfiler.java   | 238 ++
 .../java/org/apache/tika/eval/TikaEvalCLI.java  | 262 ++
 .../apache/tika/eval/XMLErrorLogUpdater.java| 226 ++
 .../tika/eval/batch/DBConsumersManager.java |  92 +++
 .../tika/eval/batch/EvalConsumerBuilder.java| 134 
 .../tika/eval/batch/EvalConsumersBuilder.java   | 133 
 .../tika/eval/batch/FileComparerBuilder.java| 122 +++
 .../eval/batch/SingleFileConsumerBuilder.java   | 108 +++
 .../apache/tika/eval/db/AbstractDBBuffer.java   |  77 ++
 .../java/org/apache/tika/eval/db/ColInfo.java   | 116 +++
 .../main/java/org/apache/tika/eval/db/Cols.java |  90 +++
 .../java/org/apache/tika/eval/db/DBBuffer.java  |  54 ++
 .../java/org/apache/tika/eval/db/DBUtil.java| 201 +
 .../java/org/apache/tika/eval/db/H2Util.java|  71 ++
 .../org/apache/tika/eval/db/MimeBuffer.java | 144 
 .../java/org/apache/tika/eval/db/TableInfo.java |  64 ++
 .../java/org/apache/tika/eval/io/DBWriter.java  | 141 
 .../org/apache/tika/eval/io/ExtractReader.java  | 161 
 .../java/org/apache/tika/eval/io/IDBWriter.java |  31 +
 .../apache/tika/eval/io/XMLLogMsgHandler.java   |  26 +
 .../org/apache/tika/eval/io/XMLLogReader.java   | 120 +++
 .../org/apache/tika/eval/reports/Report.java| 197 +
 .../tika/eval/reports/ResultsReporter.java  | 295 +++
 .../tika/eval/reports/XLSXHREFFormatter.java|  79 ++
 .../tika/eval/reports/XLSXNumFormatter.java |  54 ++
 .../tika/eval/reports/XSLXCellFormatter.java|  30 +
 .../tokens/AlphaIdeographFilterFactory.java |  74 ++
 .../tika/eval/tokens/AnalyzerDeserializer.java  | 345 
 .../tika/eval/tokens/AnalyzerManager.java   |  95 +++
 .../CJKBigramAwareLengthFilterFactory.java  |  74 ++
 .../eval/tokens/CommonTokenCountManager.java| 141 
 .../tika/eval/tokens/CommonTokenResult.java |  37 +
 .../tika/eval/tokens/ContrastStatistics.java|  78 ++
 .../tika/eval/tokens/TokenContraster.java   | 183 +
 .../eval/tokens/TokenCountPriorityQueue.java|  49 ++
 .../apache/tika/eval/tokens/TokenCounter.java   | 167 
 .../apache/tika/eval/tokens/TokenIntPair.java   |  82 ++
 .../tika/eval/tokens/TokenStatistics.java   | 127 +++
 .../tika/eval/util/LanguageIDWrapper.java   |  69 ++
 ...ache.lucene.analysis.util.TokenFilterFactory |  17 +
 .../src/main/resources/comparison-reports.xml   | 791 +++
 .../src/main/resources/lucene-analyzers.json| 107 +++
 .../src/main/resources/lucene-char-mapping.txt  |   2 +
 .../src/main/resources/profile-reports.xml  | 148 
 .../resources/tika-eval-comparison-config.xml   |  81 ++
 .../resources/tika-eval-profiler-config.xml |  76 ++
 .../test/java/org/apache/tika/MockDBWriter.java |  73 ++
 .../apache/tika/eval/AnalyzerManagerTest.java   |  79 ++
 .../org/apache/tika/eval/ComparerBatchTest.java | 411 ++
 .../org/apache/tika/eval/ProfilerBatchTest.java | 236 ++
 .../apache/tika/eval/SimpleComparerTest.java| 289 +++
 .../org/apache/tika/eval/TikaEvalCLITest.java   |  42 +
 .../apache/tika/eval/db/AbstractBufferTest.java | 160 
 .../apache/tika/eval/io/ExtractReaderTest.java  |  85 ++
 .../tika/eval/io/FatalExceptionReaderTest.java  |  32 +
 .../tika/eval/reports/ResultsReporterTest.java  |  60 ++
 .../tika/eval/tokens/LuceneTokenCounter.java| 191 +
 .../tika/eval/tokens/TokenCounterTest.java  | 131 +++
 .../org/apache/tika/eval/util/MimeUtilTest.java |  65 ++
 tika-eval/src/test/resources/commontokens/en|   8 +
 tika-eval/src/test/resources/commontokens/es|  10 +
 tika-eval/src/test/resources/commontokens/zh-cn |   8 +
 tika-eval/src/test/resources/commontokens/zh-tw |   8 +
 tika-eval/src/test/resources/log4j.properties   |  11 +
 .../src/test/resources/log4j_process.properties |  11 +
 ...ingle-file-profiler-crawl-extract-config.xml |  72 ++
 

[6/6] tika git commit: TIKA-1332 -- initial commit for tika-eval module. More work remains.

2017-02-16 Thread tallison
TIKA-1332 -- initial commit for tika-eval module. More work remains.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/aa7a0c35
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/aa7a0c35
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/aa7a0c35

Branch: refs/heads/master
Commit: aa7a0c353362d56cb1b8e77297f0807626b0246c
Parents: b9befb4
Author: tballison 
Authored: Thu Feb 16 12:18:32 2017 -0500
Committer: tballison 
Committed: Thu Feb 16 12:18:32 2017 -0500

--
 CHANGES.txt |   2 +
 LICENSE.txt |   8 +
 pom.xml |   1 +
 tika-eval/pom.xml   | 281 +++
 .../org/apache/tika/eval/AbstractProfiler.java  | 693 
 .../org/apache/tika/eval/EvalFilePaths.java | 108 +++
 .../org/apache/tika/eval/ExtractComparer.java   | 455 +++
 .../org/apache/tika/eval/ExtractProfiler.java   | 238 ++
 .../java/org/apache/tika/eval/TikaEvalCLI.java  | 262 ++
 .../apache/tika/eval/XMLErrorLogUpdater.java| 226 ++
 .../tika/eval/batch/DBConsumersManager.java |  92 +++
 .../tika/eval/batch/EvalConsumerBuilder.java| 134 
 .../tika/eval/batch/EvalConsumersBuilder.java   | 133 
 .../tika/eval/batch/FileComparerBuilder.java| 122 +++
 .../eval/batch/SingleFileConsumerBuilder.java   | 108 +++
 .../apache/tika/eval/db/AbstractDBBuffer.java   |  77 ++
 .../java/org/apache/tika/eval/db/ColInfo.java   | 116 +++
 .../main/java/org/apache/tika/eval/db/Cols.java |  90 +++
 .../java/org/apache/tika/eval/db/DBBuffer.java  |  54 ++
 .../java/org/apache/tika/eval/db/DBUtil.java| 201 +
 .../java/org/apache/tika/eval/db/H2Util.java|  71 ++
 .../org/apache/tika/eval/db/MimeBuffer.java | 144 
 .../java/org/apache/tika/eval/db/TableInfo.java |  64 ++
 .../java/org/apache/tika/eval/io/DBWriter.java  | 141 
 .../org/apache/tika/eval/io/ExtractReader.java  | 161 
 .../java/org/apache/tika/eval/io/IDBWriter.java |  31 +
 .../apache/tika/eval/io/XMLLogMsgHandler.java   |  26 +
 .../org/apache/tika/eval/io/XMLLogReader.java   | 120 +++
 .../org/apache/tika/eval/reports/Report.java| 197 +
 .../tika/eval/reports/ResultsReporter.java  | 295 +++
 .../tika/eval/reports/XLSXHREFFormatter.java|  79 ++
 .../tika/eval/reports/XLSXNumFormatter.java |  54 ++
 .../tika/eval/reports/XSLXCellFormatter.java|  30 +
 .../tokens/AlphaIdeographFilterFactory.java |  74 ++
 .../tika/eval/tokens/AnalyzerDeserializer.java  | 345 
 .../tika/eval/tokens/AnalyzerManager.java   |  95 +++
 .../CJKBigramAwareLengthFilterFactory.java  |  74 ++
 .../eval/tokens/CommonTokenCountManager.java| 141 
 .../tika/eval/tokens/CommonTokenResult.java |  37 +
 .../tika/eval/tokens/ContrastStatistics.java|  78 ++
 .../tika/eval/tokens/TokenContraster.java   | 183 +
 .../eval/tokens/TokenCountPriorityQueue.java|  49 ++
 .../apache/tika/eval/tokens/TokenCounter.java   | 167 
 .../apache/tika/eval/tokens/TokenIntPair.java   |  82 ++
 .../tika/eval/tokens/TokenStatistics.java   | 127 +++
 .../tika/eval/util/LanguageIDWrapper.java   |  69 ++
 ...ache.lucene.analysis.util.TokenFilterFactory |  17 +
 .../src/main/resources/comparison-reports.xml   | 791 +++
 .../src/main/resources/lucene-analyzers.json| 107 +++
 .../src/main/resources/lucene-char-mapping.txt  |   2 +
 .../src/main/resources/profile-reports.xml  | 148 
 .../resources/tika-eval-comparison-config.xml   |  83 ++
 .../resources/tika-eval-profiler-config.xml |  76 ++
 .../test/java/org/apache/tika/MockDBWriter.java |  73 ++
 .../apache/tika/eval/AnalyzerManagerTest.java   |  79 ++
 .../org/apache/tika/eval/ComparerBatchTest.java | 411 ++
 .../org/apache/tika/eval/ProfilerBatchTest.java | 236 ++
 .../apache/tika/eval/SimpleComparerTest.java| 289 +++
 .../org/apache/tika/eval/TikaEvalCLITest.java   |  42 +
 .../apache/tika/eval/db/AbstractBufferTest.java | 160 
 .../apache/tika/eval/io/ExtractReaderTest.java  |  85 ++
 .../tika/eval/io/FatalExceptionReaderTest.java  |  32 +
 .../tika/eval/reports/ResultsReporterTest.java  |  60 ++
 .../tika/eval/tokens/LuceneTokenCounter.java| 191 +
 .../tika/eval/tokens/TokenCounterTest.java  | 131 +++
 .../org/apache/tika/eval/util/MimeUtilTest.java |  65 ++
 tika-eval/src/test/resources/commontokens/en|   8 +
 tika-eval/src/test/resources/commontokens/es|  10 +
 tika-eval/src/test/resources/commontokens/zh-cn |   8 +
 tika-eval/src/test/resources/commontokens/zh-tw |   8 +
 tika-eval/src/test/resources/log4j.properties   |  11 +
 .../src/test/resources/log4j_process.properties |  11 +
 ...ingle-file-profiler-crawl-extract-config.xml |  72 ++
 

[2/6] tika git commit: TIKA-1332 -- initial commit for tika-eval module. More work remains.

2017-02-16 Thread tallison
http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java
--
diff --git 
a/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java 
b/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java
new file mode 100644
index 000..0d925cf
--- /dev/null
+++ b/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java
@@ -0,0 +1,411 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.nio.file.FileSystems;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.ResultSetMetaData;
+import java.sql.Statement;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.tika.batch.fs.FSBatchTestBase;
+import org.apache.tika.eval.db.Cols;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
+import org.junit.Test;
+
+@Ignore("need to fix tika-batch tests to make this work")
+public class ComparerBatchTest extends FSBatchTestBase {
+
+public final static String COMPARER_PROCESS_CLASS = 
"org.apache.tika.batch.fs.FSBatchProcessCLI";
+
+private static Path dbDir;
+private static Connection conn;
+
+private final static String compJoinCont = "";
+/*ExtractComparer.COMPARISONS_TABLE+" cmp " +
+"join "+ExtractComparer.CONTAINERS_TABLE + " cnt "+
+"on cmp."+AbstractProfiler.CONTAINER_HEADERS.CONTAINER_ID+
+" = cnt."+AbstractProfiler.CONTAINER_HEADERS.CONTAINER_ID;*/
+
+@BeforeClass
+public static void setUp() throws Exception {
+
+File inputRoot = new 
File(ComparerBatchTest.class.getResource("/test-dirs").toURI());
+dbDir = Files.createTempDirectory(inputRoot.toPath(), 
"tika-test-db-dir-");
+Map args = new HashMap<>();
+Path db = FileSystems.getDefault().getPath(dbDir.toString(), 
"comparisons_test");
+args.put("-db", db.toString());
+
+//for debugging, you can use this to select only one file pair to load
+//args.put("-includeFilePat", "file8.*");
+/*
+BatchProcessTestExecutor ex = new 
BatchProcessTestExecutor(COMPARER_PROCESS_CLASS, args,
+"/tika-batch-comparison-eval-config.xml");
+StreamStrings streamStrings = ex.execute();
+System.out.println(streamStrings.getErrString());
+System.out.println(streamStrings.getOutString());
+H2Util dbUtil = new H2Util(db);
+conn = dbUtil.getConnection();*/
+}
+
+@AfterClass
+public static void tearDown() throws Exception {
+
+conn.close();
+
+FileUtils.deleteDirectory(dbDir.toFile());
+}
+
+
+@Test
+public void testSimpleDBWriteAndRead() throws Exception {
+Set set = new HashSet<>();
+//filenames
+List list = getColStrings(Cols.FILE_NAME.name(),
+ExtractComparer.PROFILES_A.getName(), "");
+assertEquals(7, list.size());
+assertTrue(list.contains("file1.pdf"));
+
+//container ids in comparisons table
+list = getColStrings(Cols.CONTAINER_ID.name(),
+ExtractComparer.COMPARISON_CONTAINERS.getName(),"");
+assertEquals(10, list.size());
+set.clear(); set.addAll(list);
+assertEquals(10, set.size());
+/*
+//ids in comparisons table
+list = getColStrings(AbstractProfiler.HEADERS.ID.name(),
+compTable,"");
+assertEquals(9, list.size());
+set.clear(); set.addAll(list);
+assertEquals(9, set.size());*/
+}
+
+
+
+/*
+@Test
+public void testFile1PDFRow() throws Exception {
+String where = fp+"='file1.pdf'";
+Map data = getRow(compJoinCont, where);
+String result = 

[3/6] tika git commit: TIKA-1332 -- initial commit for tika-eval module. More work remains.

2017-02-16 Thread tallison
http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java
--
diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java 
b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java
new file mode 100644
index 000..28e1c78
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.tokens;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.lang3.mutable.MutableInt;
+import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
+import org.apache.commons.math3.util.FastMath;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+public class TokenCounter {
+
+private static final String ALPHA_IDEOGRAPH_SUFFIX = "_a";
+
+
+Map> map = new HashMap<>(); //Map>
+Map tokenStatistics = new HashMap<>();
+
+private final TokenStatistics NULL_TOKEN_STAT = new TokenStatistics(
+0, 0, new TokenIntPair[0], 0.0d, new SummaryStatistics());
+
+private final Analyzer generalAnalyzer;
+private final Analyzer alphaIdeoAnalyzer;
+
+private int topN = 10;
+
+public TokenCounter(Analyzer generalAnalyzer, Analyzer alphaIdeoAnalyzer) 
throws IOException {
+this.generalAnalyzer = generalAnalyzer;
+this.alphaIdeoAnalyzer = alphaIdeoAnalyzer;
+}
+
+public void add(String field, String content) throws IOException {
+_add(field, generalAnalyzer, content);
+_add(field+ALPHA_IDEOGRAPH_SUFFIX, alphaIdeoAnalyzer, content);
+}
+
+private void _add(String field, Analyzer analyzer, String content) throws 
IOException {
+int totalTokens = 0;
+
+TokenStream ts = analyzer.tokenStream(field, content);
+CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
+ts.reset();
+Map tokenMap = map.get(field);
+if (tokenMap == null) {
+tokenMap = new HashMap<>();
+map.put(field, tokenMap);
+}
+while (ts.incrementToken()) {
+String token = termAtt.toString();
+MutableInt cnt = tokenMap.get(token);
+if (cnt == null) {
+cnt = new MutableInt(1);
+tokenMap.put(token, cnt);
+} else {
+cnt.increment();
+}
+totalTokens++;
+}
+ts.close();
+ts.end();
+
+int totalUniqueTokens = tokenMap.size();
+
+double ent = 0.0d;
+double p = 0.0d;
+double base = 2.0;
+
+TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);
+
+SummaryStatistics summaryStatistics = new SummaryStatistics();
+for (Map.Entry e : tokenMap.entrySet()) {
+String token = e.getKey();
+int termFreq = e.getValue().intValue();
+
+p = (double) termFreq / (double) totalTokens;
+ent += p * FastMath.log(base, p);
+int len = token.codePointCount(0, token.length());
+for (int i = 0; i < e.getValue().intValue(); i++) {
+summaryStatistics.addValue(len);
+}
+if (queue.top() == null || queue.size() < topN ||
+termFreq >= queue.top().getValue()) {
+queue.insertWithOverflow(new TokenIntPair(token, termFreq));
+}
+
+}
+if (totalTokens > 0) {
+ent = (-1.0d / (double)totalTokens) * ent;
+}
+
+/*Collections.sort(allTokens);
+List topNList = new ArrayList<>(topN);
+for (int i = 0; i < topN && i < allTokens.size(); i++) {
+topNList.add(allTokens.get(i));
+}*/
+
+tokenStatistics.put(field, 

[4/6] tika git commit: TIKA-1332 -- initial commit for tika-eval module. More work remains.

2017-02-16 Thread tallison
http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
--
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java 
b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
new file mode 100644
index 000..cd90f76
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
@@ -0,0 +1,161 @@
+package org.apache.tika.eval.io;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import 
org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.compress.compressors.z.ZCompressorInputStream;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class ExtractReader {
+
+public enum ALTER_METADATA_LIST {
+AS_IS,  //leave the metadata list as is
+FIRST_ONLY, //take only the metadata list for the "container" document
+CONCATENATE_CONTENT_INTO_FIRST // concatenate all of the content into 
the first
+}
+private final static Logger LOGGER = 
LoggerFactory.getLogger(ExtractReader.class);
+TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
+
+public List loadExtract(Path thisFile, ALTER_METADATA_LIST 
alterExtractList) {
+List metadataList = null;
+if (thisFile == null || !Files.isRegularFile(thisFile)) {
+return metadataList;
+}
+Reader reader = null;
+InputStream is = null;
+FileSuffixes fileSuffixes = 
parseSuffixes(thisFile.getFileName().toString());
+if (fileSuffixes.txtOrJson == null) {
+LOGGER.warn("file must end with .txt or .json: 
"+thisFile.getFileName().toString());
+return metadataList;
+}
+
+try {
+is = Files.newInputStream(thisFile);
+if (fileSuffixes.compression != null) {
+if (fileSuffixes.compression.equals("bz2")) {
+is = new BZip2CompressorInputStream(is);
+} else if (fileSuffixes.compression.equals("gz")) {
+is = new GzipCompressorInputStream(is);
+} else if (fileSuffixes.compression.equals("zip")) {
+is = new ZCompressorInputStream(is);
+} else {
+LOGGER.warn("Can't yet process compression of type: 
"+fileSuffixes.compression);
+}
+}
+reader = new BufferedReader(new InputStreamReader(is, 
"UTF-8"));
+
+if (fileSuffixes.txtOrJson.equals("json")) {
+metadataList = JsonMetadataList.fromJson(reader);
+if (alterExtractList.equals(ALTER_METADATA_LIST.FIRST_ONLY) && 
metadataList.size() > 1) {
+while (metadataList.size() > 1) {
+metadataList.remove(metadataList.size()-1);
+}
+} else if 
(alterExtractList.equals(ALTER_METADATA_LIST.AS_IS.CONCATENATE_CONTENT_INTO_FIRST)
 &&
+metadataList.size() > 1) {
+StringBuilder sb = new StringBuilder();
+Metadata containerMetadata = metadataList.get(0);
+for (int i = 0; i < metadataList.size(); i++) {
+Metadata m = metadataList.get(i);
+String c = m.get(RecursiveParserWrapper.TIKA_CONTENT);
+if (c 

[1/6] tika git commit: TIKA-1332 -- initial commit for tika-eval module. More work remains.

2017-02-16 Thread tallison
Repository: tika
Updated Branches:
  refs/heads/master b9befb427 -> aa7a0c353


http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json
--
diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json 
b/tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json
new file mode 100644
index 000..6ef09de
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json
@@ -0,0 +1,5 @@
+[{
+  "Content-Type":"text/plain",
+  "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog 
1,200 12",
+  "xmpTPg:NPages":2
+}]
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file10_permahang.txt.json
--
diff --git 
a/tika-eval/src/test/resources/test-dirs/extractsA/file10_permahang.txt.json 
b/tika-eval/src/test/resources/test-dirs/extractsA/file10_permahang.txt.json
new file mode 100644
index 000..e69de29

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file11_oom.txt.json
--
diff --git 
a/tika-eval/src/test/resources/test-dirs/extractsA/file11_oom.txt.json 
b/tika-eval/src/test/resources/test-dirs/extractsA/file11_oom.txt.json
new file mode 100644
index 000..e69de29

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json
--
diff --git 
a/tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json 
b/tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json
new file mode 100644
index 000..0e2558b
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json
@@ -0,0 +1,4 @@
+[{
+  "Content-Type":"text/plain",
+  "X-TIKA:content":"El zorro marrón rápido saltó sobre el perro. El zorro 
marrón rápido saltó sobre el perro. El zorro marrón rápido saltó sobre el 
perro"
+}]
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json
--
diff --git 
a/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json 
b/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json
new file mode 100644
index 000..5371c87
--- /dev/null
+++ 
b/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json
@@ -0,0 +1,10 @@
+[{
+  "Content-Type":"text/plain",
+  "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog"
+  },
+  {
+"Content-Type":"text/plain",
+"X-TIKA:embedded_resource_path":"inner.txt",
+"X-TIKA:content":"attachment contents"
+  }
+]
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json
--
diff --git 
a/tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json 
b/tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json
new file mode 100644
index 000..5371c87
--- /dev/null
+++ 
b/tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json
@@ -0,0 +1,10 @@
+[{
+  "Content-Type":"text/plain",
+  "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog"
+  },
+  {
+"Content-Type":"text/plain",
+"X-TIKA:embedded_resource_path":"inner.txt",
+"X-TIKA:content":"attachment contents"
+  }
+]
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json
--
diff --git 
a/tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json 
b/tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json
new file mode 100644
index 000..18763d1
--- /dev/null
+++ 
b/tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json
@@ -0,0 +1,4 @@
+[{
+  "Content-Type":"text/plain",
+  "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog"
+}]
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/extractsA/file4_emptyB.pdf.json
--
diff --git 
a/tika-eval/src/test/resources/test-dirs/extractsA/file4_emptyB.pdf.json 
b/tika-eval/src/test/resources/test-dirs/extractsA/file4_emptyB.pdf.json
new file mode 

[Tika Wiki] Update of "MockParser" by TimothyAllison

2017-02-16 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Tika Wiki" for change 
notification.

The "MockParser" page has been changed by TimothyAllison:
https://wiki.apache.org/tika/MockParser?action=diff=3=4

  
  Please note that for 3., permanent hangs -- you cannot terminate the Thread.  
Thread's ''stop'', ''suspend'', ''destroy'' sound like they'll do the trick, 
but they won't. '''You need to kill the entire process.'''
  
- As of Tika 1.15, we added a MockParser in the tika-core-tests.jar that will 
allow you to test your framework against 1-3.  Simply add that jar to your 
class path and then include a  xml file in your set of test documents, 
and crash, crash away.
+ As of Tika 1.15, we added a MockParser in the tika-core-tests.jar that will 
allow you to test your framework against items 1-3.  Simply add that jar to 
your class path and then include a mock xml file in your set of test 
documents, and crash, crash away.
  
  == Usage ==
  
@@ -36, +36 @@

  
  === Your Framework ===
  Place the tika-core-tests.jar on your class path (NOT IN PRODUCTION!!!) and 
then add some mock.xml files to your batch of documents.
- 
  
  
  === Mock options ===


[Tika Wiki] Update of "MockParser" by TimothyAllison

2017-02-16 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Tika Wiki" for change 
notification.

The "MockParser" page has been changed by TimothyAllison:
https://wiki.apache.org/tika/MockParser?action=diff=1=2

  == Background ==
  So, you've tried Tika on a couple of files and all works well.  Problem 
solved!
  
+ No. 
+ 
- No. In very rare cases, Tika can so some really bad things.  We try to fix 
these problems when we can, but if history is any indication (e.g. 
[[https://issues.apache.org/jira/browse/TIKA-1132|TIKA-1132]]), if you are 
processing millions of files, you'll need to defend against:
+ In very rare cases, Tika can do some really bad things.  We try to fix these 
problems when we can, but if history is any indication (e.g. 
[[https://issues.apache.org/jira/browse/TIKA-1132|TIKA-1132]]), if you are 
processing millions/billions of files from the wild, you'll need to defend 
against:
  
   1. Regular catchable exceptions
   2. !OutOfMemory errors which can put the jvm in an unreliable state
@@ -24, +26 @@

  `java -cp "bin/*" org.apache.tika.TikaCLI mock_example.xml`
  
  === Tika-server ===
- Place the tika-server.jar and the tika-core.tests.jar in a "bin directory.
+ Place the tika-server.jar and the tika-core.tests.jar in a "bin" directory.
  
- `java -cp "serverbin/*" org.apache.tika.server.TikaServerCli`
+ `java -cp "bin/*" org.apache.tika.server.TikaServerCli`
+ 
+ Then curl away:
+ 
+ `curl -T mock_example.xml http://localhost:9998/rmeta/text`
  
  === Your Framework ===
  Place the tika-core-tests.jar on your class path (NOT IN PRODUCTION!!!) and 
then add some mock.xml files to your batch of documents.
  
  
- 
- Then curl away:
- 
- `curl -T mock_example.xml http://localhost:9998/rmeta/text`
  
  === Mock options ===
  See the mock example.xml file in 
tika-parsers/src/test/resources/test-documents/mock.  
@@ -84, +86 @@

  
  
  ``
+ == References ==
+  1. 
[[http://openpreservation.org/blog/2014/03/21/tika-ride-characterising-web-content-nanite/|Tika
 to Ride]]
+  2. 
[[http://events.linuxfoundation.org/sites/events/files/slides/TikaEval_ACNA15_allison_herceg_v2.pdf|Evaluating
 Text Extraction]]
  


[Tika Wiki] Update of "MockParser" by TimothyAllison

2017-02-16 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Tika Wiki" for change 
notification.

The "MockParser" page has been changed by TimothyAllison:
https://wiki.apache.org/tika/MockParser

New page:
= MockParser =

== Background ==
So, you've tried Tika on a couple of files and all works well.  Problem solved!

No. In very rare cases, Tika can do some really bad things.  We try to fix 
these problems when we can, but if history is any indication (e.g. 
[[https://issues.apache.org/jira/browse/TIKA-1132|TIKA-1132]]), if you are 
processing millions of files, you'll need to defend against:

 1. Regular catchable exceptions
 2. !OutOfMemory errors which can put the jvm in an unreliable state
 3. Permanent hangs (Tika can chew up massive amounts of resources and go 
''forever'')
 4. Security vulnerabilities (e.g. 
[[http://seclists.org/bugtraq/2016/Nov/40|CVE-2016-6809]] and 
[[http://seclists.org/oss-sec/2016/q2/413|CVE-2016-4434]])

Please note that for 3., permanent hangs -- you cannot terminate the Thread.  
Thread's ''stop'', ''suspend'', ''destroy'' sound like they'll do the trick, 
but they won't. '''You need to kill the entire process.'''

As of Tika 1.15, we added a MockParser in the tika-core-tests.jar that will 
allow you to test your framework against items 1-3.  Simply add that jar to your 
class path and then include a mock xml file in your set of test documents, 
and crash, crash away.

== Usage ==

=== Tika-app ===
Place the tika-app.jar and the tika-core-tests.jar in a "bin" directory.

`java -cp "bin/*" org.apache.tika.TikaCLI mock_example.xml`

=== Tika-server ===
Place the tika-server.jar and the tika-core-tests.jar in a "bin" directory.

`java -cp "bin/*" org.apache.tika.server.TikaServerCli`

=== Your Framework ===
Place the tika-core-tests.jar on your class path (NOT IN PRODUCTION!!!) and 
then add some mock.xml files to your batch of documents.



Then curl away:

`curl -T mock_example.xml http://localhost:9998/rmeta/text`

=== Mock options ===
See the mock example.xml file in 
tika-parsers/src/test/resources/test-documents/mock.  

This shows all of the examples of what you can do.
```






Nikolai Lobachevsky



some content


writing to System.out


writing to System.err





not another IOException





```


[Tika Wiki] Update of "FrontPage" by TimothyAllison

2017-02-16 Thread Apache Wiki
Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Tika Wiki" for change 
notification.

The "FrontPage" page has been changed by TimothyAllison:
https://wiki.apache.org/tika/FrontPage?action=diff=56=57

   * [[Troubleshooting Tika]]
   * [[TikaParserNotes|Notes on Specific Parsers]]
   * [[TikaEval|Using the tika-eval Module]]
+  * [[MockParser|How to Test Your Framework's Handling of Tika Behaving Badly]]
  
  = MIME identification design/implementation =
   * [[BaysianMimeTypeSelector|Bayesian MIME selection]] - Tika's new Bayesian 
MIME selector.