Revision: 6741
http://languagetool.svn.sourceforge.net/languagetool/?rev=6741&view=rev
Author: dnaber
Date: 2012-04-17 21:45:39 +0000 (Tue, 17 Apr 2012)
Log Message:
-----------
Improved startup speed by using SAX parser and new segment library (Jarek Lipski)
Modified Paths:
--------------
trunk/JLanguageTool/CHANGES.txt
trunk/JLanguageTool/build.properties
trunk/JLanguageTool/src/java/org/languagetool/tokenizers/SRXSentenceTokenizer.java
Added Paths:
-----------
trunk/JLanguageTool/libs/segment-1.3.8.jar
Removed Paths:
-------------
trunk/JLanguageTool/libs/segment-1.3.0.jar
Modified: trunk/JLanguageTool/CHANGES.txt
===================================================================
--- trunk/JLanguageTool/CHANGES.txt 2012-04-17 00:19:26 UTC (rev 6740)
+++ trunk/JLanguageTool/CHANGES.txt 2012-04-17 21:45:39 UTC (rev 6741)
@@ -10,7 +10,9 @@
-GUI: made the result of "Tag Text" more readable
+ -Improved startup speed (Jarek Lipski)
+
1.7 (2012-03-25)
-English
Modified: trunk/JLanguageTool/build.properties
===================================================================
--- trunk/JLanguageTool/build.properties 2012-04-17 00:19:26 UTC (rev 6740)
+++ trunk/JLanguageTool/build.properties 2012-04-17 21:45:39 UTC (rev 6741)
@@ -29,7 +29,7 @@
ext.morfologik.fsa.lib = ${ext.dir}/${morfologik.fsa.lib}
ext.morfologik.stemming.lib = ${ext.dir}/${morfologik.stemming.lib}
ext.jwordsplitter.lib = ${ext.dir}/jWordSplitter.jar
-segment.lib = segment-1.3.0.jar
+segment.lib = segment-1.3.8.jar
ext.segment.lib = ${ext.dir}/${segment.lib}
logging.lib = commons-logging-1.1.1.jar
ext.logging.lib = ${ext.dir}/${logging.lib}
Deleted: trunk/JLanguageTool/libs/segment-1.3.0.jar
===================================================================
(Binary files differ)
Added: trunk/JLanguageTool/libs/segment-1.3.8.jar
===================================================================
(Binary files differ)
Property changes on: trunk/JLanguageTool/libs/segment-1.3.8.jar
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Modified:
trunk/JLanguageTool/src/java/org/languagetool/tokenizers/SRXSentenceTokenizer.java
===================================================================
--- trunk/JLanguageTool/src/java/org/languagetool/tokenizers/SRXSentenceTokenizer.java 2012-04-17 00:19:26 UTC (rev 6740)
+++ trunk/JLanguageTool/src/java/org/languagetool/tokenizers/SRXSentenceTokenizer.java 2012-04-17 21:45:39 UTC (rev 6741)
@@ -19,43 +19,65 @@
package org.languagetool.tokenizers;
import java.io.BufferedReader;
+import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import net.sourceforge.segment.TextIterator;
import net.sourceforge.segment.srx.SrxDocument;
import net.sourceforge.segment.srx.SrxParser;
import net.sourceforge.segment.srx.SrxTextIterator;
-import net.sourceforge.segment.srx.io.Srx2Parser;
+import net.sourceforge.segment.srx.io.Srx2SaxParser;
+
import org.languagetool.JLanguageTool;
/**
* Class to tokenize sentences using an SRX file.
*
* @author Marcin Miłkowski
- *
+ * @author Jarek Lipski
*/
public class SRXSentenceTokenizer extends SentenceTokenizer {
-
- private final BufferedReader srxReader;
- private final SrxDocument document;
+
private final String language;
private String parCode;
- static final String RULES = "/segment.srx";
+ private static final String RULES = "/segment.srx";
+
+ private static final SrxDocument document = createSrxDocument();
- public SRXSentenceTokenizer(final String language) {
- this.language = language;
+ private static SrxDocument createSrxDocument() {
+ BufferedReader srxReader = null;
try {
srxReader = new BufferedReader(new InputStreamReader(
-
JLanguageTool.getDataBroker().getFromResourceDirAsStream(RULES), "utf-8"));
- } catch (Exception e) {
+ JLanguageTool.getDataBroker().getFromResourceDirAsStream(RULES),
"utf-8"));
+
+ Map<String, Object> parserParameters = new HashMap<String, Object>();
+ parserParameters.put(Srx2SaxParser.VALIDATE_PARAMETER, true);
+ SrxParser srxParser = new Srx2SaxParser(parserParameters);
+
+ SrxDocument document = srxParser.parse(srxReader);
+ return document;
+ } catch (IOException e) {
throw new RuntimeException("Could not load rules " + RULES + " from
resource dir "
- + JLanguageTool.getDataBroker().getResourceDir(), e);
+ + JLanguageTool.getDataBroker().getResourceDir(), e);
+ } finally {
+ if (srxReader != null) {
+ try {
+ srxReader.close();
+ } catch (IOException e) {
+ // can't throw exception in final block, so logging an error.
+ System.err.println("Error closing SRX file reader.");
+ }
+ }
}
- final SrxParser srxParser = new Srx2Parser();
- document = srxParser.parse(srxReader);
+ }
+
+ public SRXSentenceTokenizer(final String language) {
+ this.language = language;
setSingleLineBreaksMarksParagraph(false);
}
@@ -91,12 +113,4 @@
}
}
- @Override
- protected final void finalize() throws Throwable {
- if (srxReader != null) {
- srxReader.close();
- }
- super.finalize();
- }
-
}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Better than sec? Nothing is better than sec when it comes to
monitoring Big Data applications. Try Boundary one-second
resolution app monitoring today. Free.
http://p.sf.net/sfu/Boundary-dev2dev
_______________________________________________
Languagetool-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-cvs