Revision: 9205
http://languagetool.svn.sourceforge.net/languagetool/?rev=9205&view=rev
Author: dnaber
Date: 2013-01-24 20:24:27 +0000 (Thu, 24 Jan 2013)
Log Message:
-----------
copy the Main class (command line application) from old sources, it was still
missing
Added Paths:
-----------
trunk/languagetool/languagetool-standalone/src/main/java/org/languagetool/commandline/Main.java
Copied:
trunk/languagetool/languagetool-standalone/src/main/java/org/languagetool/commandline/Main.java
(from rev 9047, trunk/JLanguageTool/src/main/java/org/languagetool/Main.java)
===================================================================
---
trunk/languagetool/languagetool-standalone/src/main/java/org/languagetool/commandline/Main.java
(rev 0)
+++
trunk/languagetool/languagetool-standalone/src/main/java/org/languagetool/commandline/Main.java
2013-01-24 20:24:27 UTC (rev 9205)
@@ -0,0 +1,529 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package org.languagetool.commandline;
+
+import org.apache.tika.language.LanguageIdentifier;
+import org.languagetool.JLanguageTool;
+import org.languagetool.Language;
+import org.languagetool.bitext.TabBitextReader;
+import org.languagetool.language.English;
+import org.languagetool.rules.Rule;
+import org.languagetool.rules.bitext.BitextRule;
+import org.languagetool.tools.StringTools;
+import org.languagetool.tools.Tools;
+import org.xml.sax.SAXException;
+
+import javax.xml.parsers.ParserConfigurationException;
+import java.io.*;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+
+/**
+ * The command line tool to check plain text files.
+ *
+ * @author Daniel Naber
+ */
+class Main {
+
+ /* size threshold: shorter files are checked in one go; also the buffer length (in chars) that forces a flush in line-by-line mode */
+ private static final int MAX_FILE_SIZE = 64000;
+
+ private final boolean verbose; // when set, lt.setOutput(System.err) is called
+ private final boolean apiFormat; // passed on to Tools.checkText/checkBitext; also suppresses the "Working on" banner
+ private final boolean taggerOnly; // text is handed to Tools.tagText instead of being checked
+ private final boolean applySuggestions; // print the result of Tools.correctText instead of reporting matches
+ private final boolean autoDetect; // detect the language from the input before checking
+ private final boolean singleLineBreakMarksParagraph; // applied to the sentence tokenizer of a detected language
+ private final String[] enabledRules; // rule ids handed to Tools.selectRules
+ private final String[] disabledRules; // rule ids handed to Tools.selectRules
+ private final Language motherTongue; // second argument of the JLanguageTool constructor
+
+ private JLanguageTool lt; // current checker; replaced by changeLanguage() and setBitextMode()
+ private boolean profileRules; // switched on by setProfilingMode()
+ private boolean bitextMode; // switched on by setBitextMode()
+ private JLanguageTool srcLt; // checker for the source language, only set in bitext mode
+ private List<BitextRule> bRules; // bitext rules, only set in bitext mode
+ private Rule currentRule; // rule of the current pass in runOnFileLineByLine(), used when profiling
+
+ Main(final boolean verbose, final boolean taggerOnly,
+ final Language language, final Language motherTongue,
+ final String[] disabledRules, final String[] enabledRules,
+ final boolean apiFormat, boolean applySuggestions,
+ boolean autoDetect, boolean singleLineBreakMarksParagraph) throws
IOException,
+ SAXException, ParserConfigurationException {
+ this.verbose = verbose;
+ this.apiFormat = apiFormat;
+ this.taggerOnly = taggerOnly;
+ this.applySuggestions = applySuggestions;
+ this.autoDetect = autoDetect;
+ this.enabledRules = enabledRules;
+ this.disabledRules = disabledRules;
+ this.motherTongue = motherTongue;
+ this.singleLineBreakMarksParagraph = singleLineBreakMarksParagraph;
+ profileRules = false;
+ bitextMode = false;
+ srcLt = null;
+ bRules = null;
+ lt = new JLanguageTool(language, motherTongue);
+ lt.activateDefaultPatternRules();
+ lt.activateDefaultFalseFriendRules();
+ Tools.selectRules(lt, disabledRules, enabledRules);
+ }
+
+ /**
+  * @return the checker instance currently in use
+  */
+ JLanguageTool getJLanguageTool() {
+   return this.lt;
+ }
+
+ /**
+  * Forwards the "list unknown words" switch to the current checker.
+  */
+ private void setListUnknownWords(final boolean listUnknownWords) {
+   this.lt.setListUnknownWords(listUnknownWords);
+ }
+
+ /**
+  * Asks {@link JLanguageTool} to remove its temporary files.
+  */
+ private void cleanUp() {
+   JLanguageTool.removeTemporaryFiles();
+ }
+
+
+ /**
+  * Switches on rule profiling for subsequent runs.
+  */
+ private void setProfilingMode() {
+   this.profileRules = true;
+ }
+
+ /**
+  * Switches to bitext mode: recreates the target-language checker (without
+  * a mother tongue), creates a checker for {@code sourceLang} and loads the
+  * bitext rules for the language pair.
+  * Bitext rules whose id is listed in {@code disabledRules} are dropped; if
+  * {@code enabledRules} is not empty, only rules listed there are kept, in
+  * the order of {@code enabledRules}.
+  */
+ private void setBitextMode(final Language sourceLang,
+     final String[] disabledRules, final String[] enabledRules)
+     throws IOException, ParserConfigurationException, SAXException {
+   bitextMode = true;
+   final Language targetLang = lt.getLanguage();
+   lt = new JLanguageTool(targetLang, null);
+   srcLt = new JLanguageTool(sourceLang);
+   lt.activateDefaultPatternRules();
+   Tools.selectRules(lt, disabledRules, enabledRules);
+   Tools.selectRules(srcLt, disabledRules, enabledRules);
+   bRules = Tools.getBitextRules(sourceLang, lt.getLanguage());
+
+   // step 1: throw out everything that was explicitly disabled
+   final List<BitextRule> loadedRules = bRules;
+   final List<BitextRule> remaining = new ArrayList<BitextRule>(loadedRules);
+   for (final String disabledId : disabledRules) {
+     for (final BitextRule candidate : loadedRules) {
+       if (candidate.getId().equals(disabledId)) {
+         remaining.remove(candidate);
+       }
+     }
+   }
+   bRules = remaining;
+
+   // step 2: an explicit enable list narrows the selection further
+   if (enabledRules.length == 0) {
+     return;
+   }
+   final List<BitextRule> selected = new ArrayList<BitextRule>();
+   for (final String enabledId : enabledRules) {
+     for (final BitextRule candidate : remaining) {
+       if (candidate.getId().equals(enabledId)) {
+         selected.add(candidate);
+       }
+     }
+   }
+   bRules = selected;
+ }
+
+ /**
+  * Checks one input, choosing between the two processing strategies.
+  * <p>
+  * STDIN ({@code "-"}) is always processed line by line. A regular file is
+  * processed in one go if it is shorter than {@link #MAX_FILE_SIZE} bytes
+  * or if bitext mode is active (which uses a bitext reader instead of
+  * direct file access); otherwise it is processed line by line.
+  * <p>
+  * With language auto-detection the language is detected from the whole
+  * file, and the single-line-break option is applied to the detected
+  * language's sentence tokenizer, as the line-by-line path also does.
+  *
+  * @param filename path of the file to check, or {@code "-"} for STDIN
+  * @param encoding character encoding handed on to the readers
+  * @param listUnknownWords whether unknown words are to be reported
+  * @param xmlFiltering whether XML is filtered when checking in one go
+  * @throws IOException if reading the input fails
+  */
+ private void runOnFile(final String filename, final String encoding,
+     final boolean listUnknownWords, final boolean xmlFiltering) throws IOException {
+   if ("-".equals(filename)) {
+     // nothing is known about the size of STDIN, so stream it
+     runOnFileLineByLine(filename, encoding, listUnknownWords);
+     return;
+   }
+   if (autoDetect) {
+     Language language = detectLanguageOfFile(filename, encoding);
+     if (language == null) {
+       System.err.println("Could not detect language well enough, using English");
+       language = new English();
+     }
+     // without this the option would be ignored for the detected language
+     language.getSentenceTokenizer().setSingleLineBreaksMarksParagraph(
+         singleLineBreakMarksParagraph);
+     changeLanguage(language, motherTongue, disabledRules, enabledRules);
+     System.out.println("Using " + language.getName() + " for file " + filename);
+   }
+   final File file = new File(filename);
+   final boolean oneTime = file.length() < MAX_FILE_SIZE || bitextMode;
+   if (oneTime) {
+     runOnFileInOneGo(filename, encoding, listUnknownWords, xmlFiltering);
+   } else {
+     runOnFileLineByLine(filename, encoding, listUnknownWords);
+   }
+ }
+
+ /**
+  * Processes the whole input at once. In bitext mode the file is read
+  * through a {@link TabBitextReader} and either corrected or checked;
+  * otherwise the (optionally XML-filtered) file text is corrected,
+  * profiled, checked or tagged, depending on the active switches.
+  */
+ private void runOnFileInOneGo(String filename, String encoding,
+     boolean listUnknownWords, boolean xmlFiltering) throws IOException {
+   if (bitextMode) {
+     //TODO: add parameter to set different readers
+     final TabBitextReader bitextReader = new TabBitextReader(filename, encoding);
+     if (!applySuggestions) {
+       Tools.checkBitext(bitextReader, srcLt, lt, bRules, apiFormat);
+     } else {
+       Tools.correctBitext(bitextReader, srcLt, lt, bRules);
+     }
+     return;
+   }
+   final String contents = getFilteredText(filename, encoding, xmlFiltering);
+   if (applySuggestions) {
+     System.out.print(Tools.correctText(contents, lt));
+   } else if (profileRules) {
+     Tools.profileRulesOnText(contents, lt);
+   } else if (taggerOnly) {
+     Tools.tagText(contents, lt);
+   } else {
+     Tools.checkText(contents, lt, apiFormat, 0);
+   }
+   if (listUnknownWords) {
+     System.out.println("Unknown words: " + lt.getUnknownWords());
+   }
+ }
+
+ private void runOnFileLineByLine(String filename, String encoding, boolean
listUnknownWords) throws IOException {
+ if (verbose) {
+ lt.setOutput(System.err);
+ }
+ if (!apiFormat && !applySuggestions) {
+ if ("-".equals(filename)) {
+ System.out.println("Working on STDIN...");
+ } else {
+ System.out.println("Working on " + filename + "...");
+ }
+ }
+ int runCount = 1;
+ final List<Rule> rules = lt.getAllActiveRules();
+ if (profileRules) {
+ System.out.printf("Testing %d rules\n", rules.size());
+ System.out.println("Rule ID\tTime\tSentences\tMatches\tSentences per
sec.");
+ runCount = rules.size();
+ }
+ InputStreamReader isr = null;
+ BufferedReader br = null;
+ int lineOffset = 0;
+ int tmpLineOffset = 0;
+ final List<String> unknownWords = new ArrayList<String>();
+ StringBuilder sb = new StringBuilder();
+ for (int ruleIndex = 0; !rules.isEmpty() && ruleIndex < runCount;
ruleIndex++) {
+ currentRule = rules.get(ruleIndex);
+ int matches = 0;
+ long sentences = 0;
+ final long startTime = System.currentTimeMillis();
+ try {
+ isr = getInputStreamReader(filename, encoding, isr);
+ br = new BufferedReader(isr);
+ String line;
+ int lineCount = 0;
+ while ((line = br.readLine()) != null) {
+ sb.append(line);
+ lineCount++;
+ // to detect language from the first input line
+ if (lineCount == 1 && autoDetect) {
+ Language language = detectLanguageOfString(line);
+ if (language == null) {
+ System.err.println("Could not detect language well enough, using
English");
+ language = new English();
+ }
+ System.out.println("Language used is: " + language.getName());
+ language.getSentenceTokenizer().setSingleLineBreaksMarksParagraph(
+ singleLineBreakMarksParagraph);
+ changeLanguage(language, motherTongue, disabledRules,
enabledRules);
+ }
+ sb.append('\n');
+ tmpLineOffset++;
+ if
(lt.getLanguage().getSentenceTokenizer().singleLineBreaksMarksPara()) {
+ matches = handleLine(matches, lineOffset, sb);
+ sentences += lt.getSentenceCount();
+ if (profileRules) {
+ sentences += lt.sentenceTokenize(sb.toString()).size();
+ }
+ if (listUnknownWords && !taggerOnly) {
+ for (String word : lt.getUnknownWords())
+ if (!unknownWords.contains(word)) {
+ unknownWords.add(word);
+ }
+ }
+ sb = new StringBuilder();
+ lineOffset = tmpLineOffset;
+ } else {
+ if ("".equals(line) || sb.length() >= MAX_FILE_SIZE) {
+ matches = handleLine(matches, lineOffset, sb);
+ sentences += lt.getSentenceCount();
+ if (profileRules) {
+ sentences += lt.sentenceTokenize(sb.toString()).size();
+ }
+ if (listUnknownWords && !taggerOnly) {
+ for (String word : lt.getUnknownWords())
+ if (!unknownWords.contains(word)) {
+ unknownWords.add(word);
+ }
+ }
+ sb = new StringBuilder();
+ lineOffset = tmpLineOffset;
+ }
+ }
+ }
+ } finally {
+ if (sb.length() > 0) {
+ matches = handleLine(matches, tmpLineOffset - 1, sb);
+ sentences += lt.getSentenceCount();
+ if (profileRules) {
+ sentences += lt.sentenceTokenize(sb.toString()).size();
+ }
+ if (apiFormat && !taggerOnly && !applySuggestions) {
+ System.out.println("</matches>");
+ }
+ if (listUnknownWords && !taggerOnly) {
+ for (String word : lt.getUnknownWords()) {
+ if (!unknownWords.contains(word)) {
+ unknownWords.add(word);
+ }
+ }
+ }
+ }
+ printTimingInformation(listUnknownWords, rules, unknownWords,
ruleIndex, matches, sentences, startTime);
+ if (br != null) {
+ br.close();
+ }
+ if (isr != null) {
+ isr.close();
+ }
+ }
+ }
+ }
+
+ /**
+  * Opens a reader on the given file, or on STDIN if {@code filename} is
+  * {@code "-"}. The underlying stream is buffered. If {@code encoding} is
+  * {@code null}, the reader is created without an explicit encoding.
+  * The {@code isr} argument is not read; a new reader is always returned.
+  */
+ private InputStreamReader getInputStreamReader(String filename, String encoding,
+     InputStreamReader isr)
+     throws UnsupportedEncodingException, FileNotFoundException {
+   final InputStream source;
+   if ("-".equals(filename)) {
+     source = System.in;
+   } else {
+     source = new FileInputStream(new File(filename).getAbsolutePath());
+   }
+   final InputStream buffered = new BufferedInputStream(source);
+   if (encoding == null) {
+     return new InputStreamReader(buffered);
+   }
+   return new InputStreamReader(buffered, encoding);
+ }
+
+ /**
+  * Prints timing statistics for one pass, unless suggestions are being
+  * applied (then nothing is printed). In profiling mode a tab-separated
+  * line for the rule at {@code ruleIndex} is printed, otherwise a summary
+  * line. If requested, the sorted list of unknown words follows. In API
+  * format the whole output is wrapped in an XML comment.
+  * Note that {@code unknownWords} is sorted in place.
+  */
+ private void printTimingInformation(final boolean listUnknownWords,
+     final List<Rule> rules, final List<String> unknownWords, final int ruleIndex,
+     final int matches, final long sentences, final long startTime) {
+   if (applySuggestions) {
+     return;
+   }
+   final long time = System.currentTimeMillis() - startTime;
+   final float timeInSeconds = time / 1000.0f;
+   final float sentencesPerSecond = sentences / timeInSeconds;
+   if (apiFormat) {
+     System.out.println("<!--");
+   }
+   if (!profileRules) {
+     System.out.printf(Locale.ENGLISH,
+         "Time: %dms for %d sentences (%.1f sentences/sec)",
+         time, sentences, sentencesPerSecond);
+     System.out.println();
+   } else {
+     //TODO: run 10 times, line in runOnce mode, and use median
+     final String ruleId = rules.get(ruleIndex).getId();
+     System.out.printf(Locale.ENGLISH,
+         "%s\t%d\t%d\t%d\t%.1f",
+         ruleId, time, sentences, matches, sentencesPerSecond);
+     System.out.println();
+   }
+   if (listUnknownWords) {
+     Collections.sort(unknownWords);
+     System.out.println("Unknown words: " + unknownWords);
+   }
+   if (apiFormat) {
+     System.out.println("-->");
+   }
+ }
+
+ private int handleLine(final int matchNo, final int lineOffset,
+ final StringBuilder sb) throws IOException {
+ int matches = matchNo;
+ if (applySuggestions) {
+ System.out.print(Tools.correctText(StringTools.filterXML(sb.toString()),
+ lt));
+ } else if (profileRules) {
+ matches +=
Tools.profileRulesOnLine(StringTools.filterXML(sb.toString()),
+ lt, currentRule);
+ } else if (!taggerOnly) {
+ if (matches == 0) {
+ matches += Tools.checkText(StringTools.filterXML(sb.toString()), lt,
+ apiFormat, -1, lineOffset, matches,
+ StringTools.XmlPrintMode.START_XML);
+ } else {
+ matches += Tools.checkText(StringTools.filterXML(sb.toString()), lt,
+ apiFormat, -1, lineOffset, matches,
+ StringTools.XmlPrintMode.CONTINUE_XML);
+ }
+ } else {
+ Tools.tagText(StringTools.filterXML(sb.toString()), lt);
+ }
+ return matches;
+ }
+
+ /**
+  * Checks all files in the given directory and, recursively, in its
+  * sub-directories.
+  *
+  * @param filename path of the directory to walk
+  * @param encoding character encoding handed on to {@link #runOnFile}
+  * @param listUnknown whether unknown words are to be reported
+  * @param xmlFiltering whether XML is to be filtered
+  * @throws IllegalArgumentException if {@code filename} is not a directory
+  * @throws IOException if the directory cannot be listed or a file cannot
+  *         be read
+  */
+ private void runRecursive(final String filename, final String encoding,
+     final boolean listUnknown, final boolean xmlFiltering)
+     throws IOException, ParserConfigurationException, SAXException {
+   final File dir = new File(filename);
+   if (!dir.isDirectory()) {
+     throw new IllegalArgumentException(dir.getAbsolutePath()
+         + " is not a directory, cannot use recursion");
+   }
+   final File[] files = dir.listFiles();
+   if (files == null) {
+     // listFiles() returns null when an I/O error occurs, e.g. when the
+     // directory is not readable
+     throw new IOException("Could not list files in directory " + dir.getAbsolutePath());
+   }
+   for (final File file : files) {
+     if (file.isDirectory()) {
+       runRecursive(file.getAbsolutePath(), encoding, listUnknown, xmlFiltering);
+     } else {
+       runOnFile(file.getAbsolutePath(), encoding, listUnknown, xmlFiltering);
+     }
+   }
+ }
+
+ /**
+  * Reads the complete file and, if requested, removes XML from it. Be
+  * aware that XML filtering can make the positions reported for rule
+  * matches incorrect.
+  */
+ private String getFilteredText(final String filename, final String encoding,
+     boolean xmlFiltering) throws IOException {
+   if (verbose) {
+     lt.setOutput(System.err);
+   }
+   final boolean showBanner = !apiFormat && !applySuggestions;
+   if (showBanner) {
+     System.out.println("Working on " + filename + "...");
+   }
+   final String rawText = StringTools.readFile(new FileInputStream(filename), encoding);
+   return xmlFiltering ? StringTools.filterXML(rawText) : rawText;
+ }
+
+ private void changeLanguage(Language language, Language motherTongue,
+ String[] disabledRules, String[] enabledRules) {
+ try {
+ lt = new JLanguageTool(language, motherTongue);
+ lt.activateDefaultPatternRules();
+ lt.activateDefaultFalseFriendRules();
+ Tools.selectRules(lt, disabledRules, enabledRules);
+ if (verbose) {
+ lt.setOutput(System.err);
+ }
+ } catch (Exception e) {
+ throw new RuntimeException("Could not create LanguageTool instance for
language " + language, e);
+ }
+ }
+
+ /**
+  * Command line tool to check plain text files.
+  * <p>
+  * Parses the arguments, handles the informational options (usage,
+  * version, list of languages), falls back to STDIN if no file was given
+  * and to English if no language was given, then runs the check either on
+  * a single input or recursively on a directory. Temporary files are
+  * removed afterwards, also when the check ends with an exception.
+  *
+  * @param args the command line arguments
+  * @throws IllegalArgumentException if bitext mode is requested without a
+  *         mother tongue (which is used as the source language)
+  */
+ public static void main(final String[] args)
+     throws IOException, ParserConfigurationException, SAXException {
+   final CommandLineParser commandLineParser = new CommandLineParser();
+   CommandLineOptions options = null;
+   try {
+     options = commandLineParser.parseOptions(args);
+   } catch (WrongParameterNumberException e) {
+     commandLineParser.printUsage();
+     System.exit(1);
+   } catch (IllegalArgumentException e) {
+     System.err.println(e.toString());
+     System.exit(1);
+   } catch (UnknownParameterException e) {
+     if (e.getMessage() != null) {
+       System.err.println(e.getMessage());
+     } else {
+       System.err.println(e.toString());
+     }
+     commandLineParser.printUsage(System.err);
+     System.exit(1);
+   }
+   if (options.isPrintUsage()) {
+     commandLineParser.printUsage();
+     System.exit(1);
+   }
+   if (options.isPrintVersion()) {
+     System.out.println("LanguageTool version " + JLanguageTool.VERSION);
+     System.exit(0);
+   }
+   if (options.isPrintLanguages()) {
+     printLanguages();
+     System.exit(0);
+   }
+
+   if (options.getFilename() == null) {
+     options.setFilename("-");
+   }
+
+   if (options.getLanguage() == null) {
+     if (!options.isApiFormat() && !options.isAutoDetect()) {
+       System.err.println("No language specified, using English");
+     }
+     options.setLanguage(new English());
+   } else if (!options.isApiFormat() && !options.isApplySuggestions()) {
+     System.out.println("Expected text language: " + options.getLanguage().getName());
+   }
+
+   options.getLanguage().getSentenceTokenizer().setSingleLineBreaksMarksParagraph(
+       options.isSingleLineBreakMarksParagraph());
+   final Main prg = new Main(options.isVerbose(), options.isTaggerOnly(),
+       options.getLanguage(), options.getMotherTongue(),
+       options.getDisabledRules(), options.getEnabledRules(),
+       options.isApiFormat(), options.isApplySuggestions(),
+       options.isAutoDetect(), options.isSingleLineBreakMarksParagraph());
+   try {
+     prg.setListUnknownWords(options.isListUnknown());
+     if (options.isProfile()) {
+       prg.setProfilingMode();
+     }
+     if (options.isBitext()) {
+       if (options.getMotherTongue() == null) {
+         throw new IllegalArgumentException(
+             "You have to set the source language (as mother tongue) in bitext mode.");
+       }
+       prg.setBitextMode(options.getMotherTongue(), options.getDisabledRules(),
+           options.getEnabledRules());
+     }
+     if (options.isRecursive()) {
+       prg.runRecursive(options.getFilename(), options.getEncoding(),
+           options.isListUnknown(), options.isXmlFiltering());
+     } else {
+       prg.runOnFile(options.getFilename(), options.getEncoding(),
+           options.isListUnknown(), options.isXmlFiltering());
+     }
+   } finally {
+     // remove temporary files even if the check failed with an exception
+     prg.cleanUp();
+   }
+ }
+
+ /**
+  * Prints one line per entry of {@code Language.REAL_LANGUAGES}, made of
+  * the short name with variant, a space and the language name, sorted
+  * alphabetically.
+  */
+ private static void printLanguages() {
+   final List<String> lines = new ArrayList<String>();
+   for (final Language lang : Language.REAL_LANGUAGES) {
+     final String code = lang.getShortNameWithVariant();
+     lines.add(code + " " + lang.getName());
+   }
+   Collections.sort(lines);
+   for (final String entry : lines) {
+     System.out.println(entry);
+   }
+ }
+
+ // for language auto detect
+ // TODO: alter tika's language profiles so they are in line with LT's supported languages
+ /**
+  * Reads the whole file with the given encoding and runs language
+  * detection on its text.
+  *
+  * @return whatever {@link #detectLanguageOfString} returns for the text
+  */
+ private static Language detectLanguageOfFile(final String filename,
+     final String encoding) throws IOException {
+   final FileInputStream input = new FileInputStream(filename);
+   final String contents = StringTools.readFile(input, encoding);
+   return detectLanguageOfString(contents);
+ }
+
+ /**
+  * Lets Tika's {@link LanguageIdentifier} guess the language of the text
+  * and looks up the {@link Language} registered for the resulting code.
+  *
+  * @return the result of {@code Language.getLanguageForShortName} for the
+  *         identified language code
+  */
+ private static Language detectLanguageOfString(final String text) {
+   final String languageCode = new LanguageIdentifier(text).getLanguage();
+   return Language.getLanguageForShortName(languageCode);
+ }
+
+}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Master Visual Studio, SharePoint, SQL, ASP.NET, C# 2012, HTML5, CSS,
MVC, Windows 8 Apps, JavaScript and much more. Keep your skills current
with LearnDevNow - 3,200 step-by-step video tutorials by Microsoft
MVPs and experts. ON SALE this month only -- learn more at:
http://p.sf.net/sfu/learnnow-d2d
_______________________________________________
Languagetool-commits mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-commits