[
https://issues.apache.org/jira/browse/TIKA-2762?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16669473#comment-16669473
]
ASF GitHub Bot commented on TIKA-2762:
--------------------------------------
tballison closed pull request #251: TIKA-2762 Capture short fields (<150 chars)
in EnviParserHeader Metadata
URL: https://github.com/apache/tika/pull/251
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index f7aea79fd..2aabda4db 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -17,7 +17,6 @@
package org.apache.tika.config;
import javax.imageio.spi.ServiceRegistry;
-import javax.xml.parsers.DocumentBuilder;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
@@ -60,7 +59,6 @@
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.DefaultParser;
-import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.multiple.AbstractMultipleParser;
diff --git
a/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java
b/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java
index 3b9d373f4..c2ddb2fd0 100644
--- a/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java
+++ b/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java
@@ -20,6 +20,7 @@
import java.io.IOException;
import java.io.InputStream;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.junit.Test;
@@ -191,7 +192,20 @@ public void testDetectStreamReadProblems() throws
Exception {
InputStream stream = new RestrictiveInputStream(data);
assertEquals(testMT, detector.detect(stream, new Metadata()));
}
-
+
+ @Test
+ public void testDetectApplicationEnviHdr() throws Exception {
+ InputStream iStream = MagicDetectorTest.class.getResourceAsStream(
+ "/test-documents/ang20150420t182050_corr_v1e_img.hdr");
+ byte[] data = IOUtils.toByteArray(iStream);
+ MediaType testMT = new MediaType("application", "envi.hdr");
+ Detector detector = new MagicDetector(testMT, data, null, false, 0, 0);
+ // Deliberately prevent InputStream.read(...) from reading the entire
+ // buffer in one go
+ InputStream stream = new RestrictiveInputStream(data);
+ assertEquals(testMT, detector.detect(stream, new Metadata()));
+ }
+
@Test
public void testDetectString() throws Exception {
String data = "abcdEFGhijklmnoPQRstuvwxyz0123456789";
diff --git
a/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java
b/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java
index 7e4839b07..f13346594 100644
--- a/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java
+++ b/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java
@@ -44,6 +44,7 @@ public void setUp() {
Pattern.compile(".*\\.txt", Pattern.CASE_INSENSITIVE),
MediaType.TEXT_PLAIN);
patterns.put(Pattern.compile("README"), MediaType.TEXT_PLAIN);
+ patterns.put(Pattern.compile(".*\\.hdr"),
MediaType.application("envi.hdr"));
detector = new NameDetector(patterns);
}
@@ -83,6 +84,8 @@ public void testDetect() {
assertDetect(MediaType.TEXT_PLAIN, "See README.txt"); // even this!
assertDetect(MediaType.OCTET_STREAM, "See README"); // but not this
+ assertDetect(MediaType.application("envi.hdr"),
"ang20150420t182050_corr_v1e_img.hdr");
+
// test also the zero input cases
assertDetect(MediaType.OCTET_STREAM, "");
assertDetect(MediaType.OCTET_STREAM, null);
@@ -104,5 +107,4 @@ private void assertDetect(MediaType type, String name){
fail("NameDetector should never throw an IOException");
}
}
-
}
diff --git
a/tika-core/src/test/resources/test-documents/ang20150420t182050_corr_v1e_img.hdr
b/tika-core/src/test/resources/test-documents/ang20150420t182050_corr_v1e_img.hdr
new file mode 100644
index 000000000..ba44396cc
--- /dev/null
+++
b/tika-core/src/test/resources/test-documents/ang20150420t182050_corr_v1e_img.hdr
@@ -0,0 +1,20 @@
+ENVI
+description = {
+ Georeferenced Image built from input GLT. [Wed Jun 10 04:37:54 2015] [Wed
+ Jun 10 04:48:52 2015]}
+samples = 739
+lines = 14674
+bands = 432
+header offset = 0
+file type = ENVI Standard
+data type = 4
+interleave = bil
+sensor type = Unknown
+byte order = 0
+map info = { UTM , 1.000 , 1.000 , 724522.127 , 4074620.759 , 1.1000000000e+00
, 1.1000000000e+00 , 12 , North , WGS-84 , units=Meters , rotation=75.00000000 }
+wavelength units = Nanometers
+correction factors = { 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.880586 , 1.741631 , 1.524632 , 1.564904 , 1.390035 , 1.342501 , 1.264768 ,
1.200357 , 1.131471 , 1.06512 , 1.105211 , 1.081234 , 1.064999 , 1.056798 ,
1.042573 , 1.009621 , 1.009031 , 1.002107 , 0.997332 , 0.976807 , 0.963755 ,
0.969028 , 0.962823 , 0.949522 , 0.960568 , 0.957813 , 0.934766 , 0.944994 ,
0.937726 , 0.935257 , 0.932706 , 0.932568 , 0.933217 , 0.928705 , 0.929294 ,
0.936669 , 0.935498 , 0.94823 , 0.949846 , 0.945885 , 0.935468 , 0.930084 ,
0.934473 , 0.935378 , 0.939193 , 0.935081 , 0.937398 , 0.943396 , 0.947133 ,
0.950645 , 0.945531 , 0.940295 , 0.933129 , 0.930664 , 0.92736 , 0.931786 ,
0.928217 , 0.928205 , 0.926481 , 0.928583 , 0.930504 , 0.93648 , 0.930731 ,
0.931265 , 0.935063 , 0.93434 , 0.926983 , 0.932689 , 0.936477 , 0.939647 ,
0.940155 , 0.937519 , 0.939448 , 0.942124 , 0.93653 , 0.9435 , 0.959204 ,
0.942566 , 0.940873 , 0.939414 , 0.939822 , 0.940174 , 0.941372 , 0.939347 ,
0.942108 , 0.942664 , 0.934811 , 0.934567 , 0.937712 , 0.940611 , 0.944809 ,
0.939877 , 0.943376 , 0.939189 , 0.943619 , 0.946268 , 0.940166 , 0.953752 ,
0.958975 , 0.954512 , 0.954103 , 0.958978 , 0.953247 , 0.952199 , 0.956082 ,
0.957846 , 0.970078 , 0.973704 , 0.980014 , 0.928845 , 0.922973 , 0.954414 ,
0.95521 , 0.961276 , 0.964513 , 0.965296 , 0.964644 , 0.954999 , 0.951133 ,
0.956216 , 0.951977 , 0.948547 , 0.949499 , 0.952685 , 0.950158 , 0.944263 ,
0.936946 , 0.938394 , 0.941325 , 0.94116 , 0.941397 , 0.940811 , 0.942695 ,
0.945228 , 0.953929 , 0.962457 , 0.968728 , 0.963947 , 0.961222 , 0.963003 ,
0.967658 , 0.969773 , 0.970294 , 0.963456 , 0.970497 , 0.976972 , 0.961611 ,
0.953081 , 0.945668 , 0.993867 , 1.019915 , 0.997013 , 0.977643 , 0.998022 ,
1.007041 , 1.003881 , 0.991335 , 0.976202 , 0.967636 , 0.969294 , 0.965331 ,
0.968705 , 0.965705 , 0.973601 , 0.97282 , 0.970848 , 0.970687 , 0.969394 ,
0.972263 , 0.969286 , 0.970327 , 0.97754 , 0.984703 , 0.993916 , 1.02186 ,
1.054704 , 1.061183 , 1.036962 , 1.012519 , 0.991209 , 0.975974 , 0.965446 ,
0.958801 , 0.951519 , 0.960628 , 0.957276 , 0.96061 , 0.95689 , 0.956666 ,
0.965567 , 0.982251 , 1.038526 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0
, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.131643 , 1.150333 ,
1.112466 , 1.06255 , 1.033182 , 1.03018 , 1.006512 , 0.993285 , 1.003557 ,
1.014441 , 0.998586 , 0.994426 , 0.987975 , 0.984681 , 0.981863 , 0.964106 ,
0.956432 , 0.954467 , 0.956429 , 0.950668 , 0.947217 , 0.944635 , 0.942942 ,
0.941078 , 0.943502 , 0.950408 , 0.957625 , 0.965073 , 0.976012 , 0.976683 ,
0.975625 , 0.968011 , 0.968843 , 0.970632 , 0.960977 , 0.960505 , 0.955015 ,
0.953597 , 0.951119 , 0.945679 , 0.949988 , 0.951236 , 0.947813 , 0.948004 ,
0.950015 , 0.939258 , 0.945863 , 0.953927 , 0.953145 , 0.945291 , 0.942319 ,
0.947022 , 0.948264 , 0.947112 , 0.942092 , 0.943128 , 0.948068 , 0.944432 ,
0.950396 , 0.964006 , 0.961019 , 0.951786 , 0.957457 , 0.950327 , 0.954375 ,
0.9608 , 0.965864 , 0.982396 , 1.011334 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.289798 , 1.126375 , 1.074621 , 1.069365 , 1.051364 ,
1.009432 , 0.984578 , 0.991187 , 1.016306 , 1.083681 , 1.076539 , 1.104657 ,
1.114005 , 1.081102 , 1.039196 , 1.009524 , 0.966515 , 0.953688 , 0.966274 ,
0.973232 , 0.966402 , 0.946801 , 0.951952 , 0.965679 , 0.96977 , 0.946541 ,
0.941678 , 0.937528 , 0.922351 , 0.914192 , 0.92879 , 0.932284 , 0.933182 ,
0.922322 , 0.91851 , 0.919591 , 0.925027 , 0.924611 , 0.932288 , 0.933352 ,
0.930517 , 0.931666 , 0.931763 , 0.932655 , 0.928945 , 0.933308 , 0.932392 ,
0.932943 , 0.935328 , 0.947019 , 0.954093 , 0.95156 , 0.939591 , 0.942808 ,
0.944862 , 0.944004 , 0.949161 , 0.950992 , 0.956738 , 0.951184 , 0.953545 ,
0.958836 , 0.966134 , 0.956752 , 0.951961 , 0.958667 , 0.9579 , 0.968531 ,
0.973792 , 0.969238 , 0.970838 , 0.954552 , 0.968166 , 0.989176 , 0.974784 ,
0.970674 , 0.9733 , 0.990576 , 1.0062 , 1.010295 , 0.99378 , 0.986109 ,
1.007054 , 1.005377 , 1.010013 , 1.014671 , 1.021618 , 1.021229 , 1.021003 ,
1.020866 , 1.029358 , 1.042136 , 1.030482 , 1.019556 , 1.036656 , 1.05348 ,
1.015947 , 1.07263 , 1.092879 , 1.053624 , 1.086491 , 1.139334 , 1.163645 ,
1.162487 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 }
+wavelength = { 346.2995778 , 351.3082478 , 356.3169178 , 361.3255878 ,
366.3342578 , 371.3429278 , 376.3515978 , 381.3602678 , 386.3689378 ,
391.3776078 , 396.3862778 , 401.3949478 , 406.4036178 , 411.4122878 ,
416.4209578 , 421.4296278 , 426.4382978 , 431.4469678 , 436.4556378 ,
441.4643078 , 446.4729778 , 451.4816478 , 456.4903178 , 461.4989878 ,
466.5076578 , 471.5163278 , 476.5249978 , 481.5336678 , 486.5423378 ,
491.5510078 , 496.5596778 , 501.5683478 , 506.5770178 , 511.5856878 ,
516.5943578 , 521.6030278 , 526.6116978 , 531.6203678 , 536.6290378 ,
541.6377078 , 546.6463778 , 551.6550478 , 556.6637178 , 561.6723878 ,
566.6810578 , 571.6897278 , 576.6983978 , 581.7070678 , 586.7157378 ,
591.7244078 , 596.7330778 , 601.7417478 , 606.7504178 , 611.7590878 ,
616.7677578 , 621.7764278 , 626.7850978 , 631.7937678 , 636.8024378 ,
641.8111078 , 646.8197778 , 651.8284478 , 656.8371178 , 661.8457878 ,
666.8544578 , 671.8631278 , 676.8717978 , 681.8804678 , 686.8891378 ,
691.8978078 , 696.9064778 , 701.9151478 , 706.9238178 , 711.9324878 ,
716.9411578 , 721.9498278 , 726.9584978 , 731.9671678 , 736.9758378 ,
741.9845078 , 746.9931778 , 752.0018478 , 757.0105178 , 762.0191878 ,
767.0278578 , 772.0365278 , 777.0451978 , 782.0538678 , 787.0625378 ,
792.0712078 , 797.0798778 , 802.0885478 , 807.0972178 , 812.1058878 ,
817.1145578 , 822.1232278 , 827.1318978 , 832.1405678 , 837.1492378 ,
842.1579078 , 847.1665778 , 852.1752478 , 857.1839178 , 862.1925878 ,
867.2012578 , 872.2099278 , 877.2185978 , 882.2272678 , 887.2359378 ,
892.2446078 , 897.2532778 , 902.2619478 , 907.2706178 , 912.2792878 ,
917.2879578 , 922.2966278 , 927.3052978 , 932.3139678 , 937.3226378 ,
942.3313078 , 947.3399778 , 952.3486478 , 957.3573178 , 962.3659878 ,
967.3746578 , 972.3833278 , 977.3919978 , 982.4006678 , 987.4093378 ,
992.4180078 , 997.4266778 , 1002.4353478 , 1007.4440178 , 1012.4526878 ,
1017.4613578 , 1022.4700278 , 1027.4786978 , 1032.4873678 , 1037.4960378 ,
1042.5047078 , 1047.5133778 , 1052.5220478 , 1057.5307178 , 1062.5393878 ,
1067.5480578 , 1072.5567278 , 1077.5653978 , 1082.5740778 , 1087.5827478 ,
1092.5914078 , 1097.6000778 , 1102.6087478 , 1107.6174278 , 1112.6260878 ,
1117.6347678 , 1122.6434278 , 1127.6520978 , 1132.6607678 , 1137.6694478 ,
1142.6781078 , 1147.6867778 , 1152.6954478 , 1157.7041278 , 1162.7127878 ,
1167.7214678 , 1172.7301378 , 1177.7388078 , 1182.7474778 , 1187.7561478 ,
1192.7648178 , 1197.7734878 , 1202.7821578 , 1207.7908278 , 1212.7994978 ,
1217.8081678 , 1222.8168378 , 1227.8255078 , 1232.8341778 , 1237.8428478 ,
1242.8515178 , 1247.8601878 , 1252.8688578 , 1257.8775278 , 1262.8861978 ,
1267.8948678 , 1272.9035378 , 1277.9122078 , 1282.9208778 , 1287.9295478 ,
1292.9382178 , 1297.9468878 , 1302.9555578 , 1307.9642278 , 1312.9728978 ,
1317.9815678 , 1322.9902378 , 1327.9989078 , 1333.0075778 , 1338.0162478 ,
1343.0249178 , 1348.0335878 , 1353.0422578 , 1358.0509278 , 1363.0595978 ,
1368.0682678 , 1373.0769378 , 1378.0856078 , 1383.0942778 , 1388.1029478 ,
1393.1116178 , 1398.1202878 , 1403.1289578 , 1408.1376278 , 1413.1462978 ,
1418.1549678 , 1423.1636378 , 1428.1723078 , 1433.1809778 , 1438.1896478 ,
1443.1983178 , 1448.2069878 , 1453.2156578 , 1458.2243278 , 1463.2329978 ,
1468.2416678 , 1473.2503378 , 1478.2590078 , 1483.2676778 , 1488.2763478 ,
1493.2850178 , 1498.2936878 , 1503.3023578 , 1508.3110278 , 1513.3196978 ,
1518.3283678 , 1523.3370378 , 1528.3457078 , 1533.3543778 , 1538.3630478 ,
1543.3717178 , 1548.3803878 , 1553.3890578 , 1558.3977278 , 1563.4063978 ,
1568.4150678 , 1573.4237378 , 1578.4324078 , 1583.4410778 , 1588.4497478 ,
1593.4584178 , 1598.4670878 , 1603.4757578 , 1608.4844278 , 1613.4930978 ,
1618.5017678 , 1623.5104378 , 1628.5191078 , 1633.5277778 , 1638.5364478 ,
1643.5451178 , 1648.5537878 , 1653.5624578 , 1658.5711278 , 1663.5797978 ,
1668.5884678 , 1673.5971378 , 1678.6058078 , 1683.6144778 , 1688.6231478 ,
1693.6318178 , 1698.6404878 , 1703.6491578 , 1708.6578278 , 1713.6664978 ,
1718.6751678 , 1723.6838378 , 1728.6925078 , 1733.7011778 , 1738.7098478 ,
1743.7185178 , 1748.7271878 , 1753.7358578 , 1758.7445278 , 1763.7531978 ,
1768.7618678 , 1773.7705378 , 1778.7792078 , 1783.7878778 , 1788.7965478 ,
1793.8052178 , 1798.8138878 , 1803.8225578 , 1808.8312278 , 1813.8398978 ,
1818.8485678 , 1823.8572378 , 1828.8659078 , 1833.8745778 , 1838.8832478 ,
1843.8919178 , 1848.9005878 , 1853.9092578 , 1858.9179278 , 1863.9265978 ,
1868.9352678 , 1873.9439378 , 1878.9526078 , 1883.9612778 , 1888.9699478 ,
1893.9786178 , 1898.9872878 , 1903.9959578 , 1909.0046278 , 1914.0132978 ,
1919.0219778 , 1924.0306378 , 1929.0393178 , 1934.0479778 , 1939.0566478 ,
1944.0653278 , 1949.0739978 , 1954.0826578 , 1959.0913278 , 1964.1000078 ,
1969.1086778 , 1974.1173378 , 1979.1260078 , 1984.1346778 , 1989.1433578 ,
1994.1520178 , 1999.1606878 , 2004.1693678 , 2009.1780378 , 2014.1867078 ,
2019.1953778 , 2024.2040478 , 2029.2127178 , 2034.2213878 , 2039.2300578 ,
2044.2387278 , 2049.2473978 , 2054.2560678 , 2059.2647378 , 2064.2734078 ,
2069.2820778 , 2074.2907478 , 2079.2994178 , 2084.3080878 , 2089.3167578 ,
2094.3254278 , 2099.3340978 , 2104.3427678 , 2109.3514378 , 2114.3601078 ,
2119.3687778 , 2124.3774478 , 2129.3861178 , 2134.3947878 , 2139.4034578 ,
2144.4121278 , 2149.4207978 , 2154.4294678 , 2159.4381378 , 2164.4468078 ,
2169.4554778 , 2174.4641478 , 2179.4728178 , 2184.4814878 , 2189.4901578 ,
2194.4988278 , 2199.5074978 , 2204.5161678 , 2209.5248378 , 2214.5335078 ,
2219.5421778 , 2224.5508478 , 2229.5595178 , 2234.5681878 , 2239.5768578 ,
2244.5855278 , 2249.5941978 , 2254.6028678 , 2259.6115378 , 2264.6202078 ,
2269.6288778 , 2274.6375478 , 2279.6462178 , 2284.6548878 , 2289.6635578 ,
2294.6722278 , 2299.6808978 , 2304.6895678 , 2309.6982378 , 2314.7069078 ,
2319.7155778 , 2324.7242478 , 2329.7329178 , 2334.7415878 , 2339.7502578 ,
2344.7589278 , 2349.7675978 , 2354.7762678 , 2359.7849378 , 2364.7936078 ,
2369.8022778 , 2374.8109478 , 2379.8196178 , 2384.8282878 , 2389.8369578 ,
2394.8456278 , 2399.8542978 , 2404.8629678 , 2409.8716378 , 2414.8803078 ,
2419.8889778 , 2424.8976478 , 2429.9063178 , 2434.9149878 , 2439.9236578 ,
2444.9323278 , 2449.9409978 , 2454.9496678 , 2459.9583378 , 2464.9670078 ,
2469.9756778 , 2474.9843478 , 2479.9930178 , 2485.0016878 , 2490.0103578 ,
2495.0190278 , 2500.0276978 , 2505.0363678 }
+fwhm = { 5.55165 , 5.55524 , 5.55879 , 5.5623 , 5.56577 , 5.5692 , 5.5726 ,
5.57595 , 5.57927 , 5.58255 , 5.58579 , 5.58899 , 5.59216 , 5.59529 , 5.59839 ,
5.60144 , 5.60446 , 5.60745 , 5.6104 , 5.61332 , 5.61619 , 5.61904 , 5.62185 ,
5.62463 , 5.62737 , 5.63008 , 5.63275 , 5.6354 , 5.63801 , 5.64058 , 5.64313 ,
5.64564 , 5.64812 , 5.65057 , 5.65299 , 5.65537 , 5.65773 , 5.66005 , 5.66235 ,
5.66461 , 5.66684 , 5.66905 , 5.67122 , 5.67337 , 5.67549 , 5.67758 , 5.67964 ,
5.68167 , 5.68367 , 5.68565 , 5.6876 , 5.68952 , 5.69141 , 5.69328 , 5.69512 ,
5.69694 , 5.69873 , 5.70049 , 5.70223 , 5.70394 , 5.70563 , 5.70729 , 5.70893 ,
5.71055 , 5.71214 , 5.7137 , 5.71524 , 5.71676 , 5.71826 , 5.71973 , 5.72118 ,
5.72261 , 5.72401 , 5.72539 , 5.72676 , 5.72809 , 5.72941 , 5.73071 , 5.73198 ,
5.73324 , 5.73447 , 5.73569 , 5.73688 , 5.73806 , 5.73921 , 5.74034 , 5.74146 ,
5.74256 , 5.74363 , 5.74469 , 5.74573 , 5.74676 , 5.74776 , 5.74875 , 5.74972 ,
5.75067 , 5.7516 , 5.75252 , 5.75342 , 5.75431 , 5.75518 , 5.75603 , 5.75687 ,
5.75769 , 5.75849 , 5.75928 , 5.76006 , 5.76082 , 5.76156 , 5.76229 , 5.76301 ,
5.76371 , 5.7644 , 5.76507 , 5.76573 , 5.76638 , 5.76702 , 5.76764 , 5.76825 ,
5.76884 , 5.76943 , 5.77 , 5.77056 , 5.77111 , 5.77164 , 5.77217 , 5.77268 ,
5.77319 , 5.77368 , 5.77416 , 5.77463 , 5.77509 , 5.77554 , 5.77598 , 5.77641 ,
5.77683 , 5.77725 , 5.77765 , 5.77804 , 5.77843 , 5.7788 , 5.77917 , 5.77953 ,
5.77988 , 5.78022 , 5.78056 , 5.78088 , 5.7812 , 5.78151 , 5.78182 , 5.78212 ,
5.78241 , 5.78269 , 5.78297 , 5.78324 , 5.78351 , 5.78377 , 5.78402 , 5.78427 ,
5.78452 , 5.78475 , 5.78499 , 5.78521 , 5.78543 , 5.78565 , 5.78587 , 5.78607 ,
5.78628 , 5.78648 , 5.78668 , 5.78687 , 5.78706 , 5.78724 , 5.78743 , 5.78761 ,
5.78778 , 5.78796 , 5.78813 , 5.7883 , 5.78846 , 5.78862 , 5.78879 , 5.78895 ,
5.7891 , 5.78926 , 5.78941 , 5.78957 , 5.78972 , 5.78987 , 5.79002 , 5.79017 ,
5.79032 , 5.79046 , 5.79061 , 5.79076 , 5.79091 , 5.79105 , 5.7912 , 5.79135 ,
5.7915 , 5.79164 , 5.79179 , 5.79194 , 5.79209 , 5.79224 , 5.7924 , 5.79255 ,
5.79271 , 5.79286 , 5.79302 , 5.79318 , 5.79335 , 5.79351 , 5.79368 , 5.79385 ,
5.79402 , 5.79419 , 5.79437 , 5.79455 , 5.79473 , 5.79492 , 5.79511 , 5.7953 ,
5.7955 , 5.7957 , 5.7959 , 5.7961 , 5.79631 , 5.79653 , 5.79675 , 5.79697 ,
5.7972 , 5.79743 , 5.79766 , 5.7979 , 5.79815 , 5.7984 , 5.79865 , 5.79891 ,
5.79918 , 5.79945 , 5.79972 , 5.80001 , 5.80029 , 5.80058 , 5.80088 , 5.80119 ,
5.8015 , 5.80181 , 5.80213 , 5.80246 , 5.8028 , 5.80314 , 5.80349 , 5.80384 ,
5.8042 , 5.80457 , 5.80494 , 5.80532 , 5.80571 , 5.80611 , 5.80651 , 5.80692 ,
5.80734 , 5.80777 , 5.8082 , 5.80864 , 5.80909 , 5.80955 , 5.81001 , 5.81048 ,
5.81096 , 5.81145 , 5.81195 , 5.81246 , 5.81297 , 5.81349 , 5.81402 , 5.81456 ,
5.81511 , 5.81567 , 5.81623 , 5.81681 , 5.81739 , 5.81799 , 5.81859 , 5.8192 ,
5.81982 , 5.82045 , 5.82109 , 5.82174 , 5.8224 , 5.82307 , 5.82374 , 5.82443 ,
5.82513 , 5.82584 , 5.82656 , 5.82728 , 5.82802 , 5.82877 , 5.82953 , 5.8303 ,
5.83108 , 5.83186 , 5.83266 , 5.83347 , 5.8343 , 5.83513 , 5.83597 , 5.83682 ,
5.83769 , 5.83856 , 5.83944 , 5.84034 , 5.84125 , 5.84217 , 5.8431 , 5.84404 ,
5.84499 , 5.84595 , 5.84692 , 5.84791 , 5.84891 , 5.84992 , 5.85093 , 5.85197 ,
5.85301 , 5.85406 , 5.85513 , 5.85621 , 5.85729 , 5.8584 , 5.85951 , 5.86063 ,
5.86177 , 5.86292 , 5.86408 , 5.86525 , 5.86643 , 5.86763 , 5.86884 , 5.87005 ,
5.87129 , 5.87253 , 5.87379 , 5.87506 , 5.87634 , 5.87763 , 5.87893 , 5.88025 ,
5.88158 , 5.88292 , 5.88428 , 5.88564 , 5.88702 , 5.88841 , 5.88982 , 5.89123 ,
5.89266 , 5.8941 , 5.89556 , 5.89702 , 5.8985 , 5.89999 , 5.90149 , 5.90301 ,
5.90454 , 5.90608 , 5.90763 , 5.9092 , 5.91078 , 5.91237 , 5.91398 , 5.91559 ,
5.91722 , 5.91886 , 5.92052 , 5.92219 , 5.92387 , 5.92556 , 5.92727 , 5.92898 ,
5.93071 , 5.93246 , 5.93421 , 5.93598 , 5.93776 , 5.93956 , 5.94137 , 5.94319 ,
5.94502 , 5.94686 , 5.94872 , 5.95059 , 5.95247 , 5.95437 , 5.95628 , 5.9582 ,
5.96013 , 5.96207 , 5.96403 , 5.966 , 5.96799 , 5.96998 , 5.97199 , 5.97401 ,
5.97604 , 5.97809 , 5.98015 , 5.98222 , 5.9843 , 5.98639 , 5.9885 , 5.99062 ,
5.99275 , 5.9949 , 5.99706 , 5.99922 , 6.00141 , 6.0036 , 6.0058 , 6.00802 ,
6.01025 , 6.01249 , 6.01475 , 6.01701 , 6.01929 , 6.02158 , 6.02388 , 6.02619 ,
6.02852 }
+bbl = { 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0
, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 ,
0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 }
+smoothing factors = { 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ,
1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 }
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java
b/tika-parsers/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java
index 6884a7ef0..10d8845e7 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java
@@ -89,7 +89,7 @@ public void parse(InputStream stream, ContentHandler handler,
metadata.set(Metadata.CONTENT_ENCODING, charset.name());
xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
- readLines(reader);
+ readLines(reader, metadata);
xhtml.endDocument();
} catch (IOException | TikaException e) {
LOG.error("Error reading input data stream.", e);
@@ -97,25 +97,31 @@ public void parse(InputStream stream, ContentHandler
handler,
}
- private void readLines(AutoDetectReader reader) throws IOException,
SAXException {
- // text contents of the xhtml
- String line;
- while ((line = reader.readLine()) != null) {
- if (line.contains("{") && !line.endsWith("}") || line.startsWith("
")) {
- String completeField = parseMultiLineFieldValue(line);
- if (completeField != null) {
- writeParagraph(completeField);
- }
- } else {
- writeParagraph(line);
+ private void readLines(AutoDetectReader reader, Metadata metadata) throws
IOException, SAXException {
+ // text contents of the xhtml
+ String line;
+ while ((line = reader.readLine()) != null) {
+ if (line.contains("{") && !line.endsWith("}") || line.startsWith("
")) {
+ String completeField = parseMultiLineFieldValue(line);
+ if (completeField != null) {
+ writeParagraphAndSetMetadata(completeField, metadata);
+ }
+ } else {
+ writeParagraphAndSetMetadata(line, metadata);
}
- }
+ }
}
/*
- * Simple write a line to the XHTMLContentHandler
+ * Write a line to the XHTMLContentHandler and populate the key, value
into the Metadata
*/
- private void writeParagraph(String line) throws SAXException {
+ private void writeParagraphAndSetMetadata(String line, Metadata metadata)
throws SAXException {
+ if(line.length() < 150) {
+ String[] keyValue = line.split("=");
+ if(keyValue.length != 1) {
+ metadata.set("envi." + keyValue[0].trim().replace(" ", "."),
keyValue[1].trim());
+ }
+ }
xhtml.startElement("p");
xhtml.characters(line);
xhtml.endElement("p");
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
> Capture short fields (<150 chars) in EnviParserHeader Metadata
> --------------------------------------------------------------
>
> Key: TIKA-2762
> URL: https://issues.apache.org/jira/browse/TIKA-2762
> Project: Tika
> Issue Type: Improvement
> Components: parser
> Affects Versions: 1.19.1
> Reporter: Lewis John McGibbney
> Assignee: Lewis John McGibbney
> Priority: Major
> Fix For: 1.20
>
>
> I have always wanted to capture more metadata for the EnviHeader files. Right
> now everything is shoved into the record's content and I think we could
> improve it.
> I've implemented a rudimentary parser improvement that essentially captures
> any reasonably sized line items (<150 chars); these can then be populated up
> to the Metadata level, making faceted search over ENVI .hdr documents a much
> easier task.
> PR coming up.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)