Main.java

dnaber Thu, 24 Jan 2013 12:24:42 -0800

Revision: 9205
          
http://languagetool.svn.sourceforge.net/languagetool/?rev=9205&view=rev
Author:   dnaber
Date:     2013-01-24 20:24:27 +0000 (Thu, 24 Jan 2013)
Log Message:
-----------
Copy the Main class (command line application) from the old sources; it was
still missing.


Added Paths:
-----------
    
trunk/languagetool/languagetool-standalone/src/main/java/org/languagetool/commandline/Main.java

Copied: 
trunk/languagetool/languagetool-standalone/src/main/java/org/languagetool/commandline/Main.java
 (from rev 9047, trunk/JLanguageTool/src/main/java/org/languagetool/Main.java)
===================================================================
--- 
trunk/languagetool/languagetool-standalone/src/main/java/org/languagetool/commandline/Main.java
                             (rev 0)
+++ 
trunk/languagetool/languagetool-standalone/src/main/java/org/languagetool/commandline/Main.java
     2013-01-24 20:24:27 UTC (rev 9205)
@@ -0,0 +1,529 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+package org.languagetool.commandline;
+
+import org.apache.tika.language.LanguageIdentifier;
+import org.languagetool.JLanguageTool;
+import org.languagetool.Language;
+import org.languagetool.bitext.TabBitextReader;
+import org.languagetool.language.English;
+import org.languagetool.rules.Rule;
+import org.languagetool.rules.bitext.BitextRule;
+import org.languagetool.tools.StringTools;
+import org.languagetool.tools.Tools;
+import org.xml.sax.SAXException;
+
+import javax.xml.parsers.ParserConfigurationException;
+import java.io.*;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+
+/**
+ * The command line tool to check plain text files.
+ * 
+ * @author Daniel Naber
+ */
+class Main {
+
+  /* maximum file size to read in a single read */
+  private static final int MAX_FILE_SIZE = 64000;
+
+  private final boolean verbose;
+  private final boolean apiFormat;
+  private final boolean taggerOnly;
+  private final boolean applySuggestions;
+  private final boolean autoDetect;
+  private final boolean singleLineBreakMarksParagraph;
+  private final String[] enabledRules;
+  private final String[] disabledRules;
+  private final Language motherTongue;
+  
+  private JLanguageTool lt;
+  private boolean profileRules;
+  private boolean bitextMode;
+  private JLanguageTool srcLt;
+  private List<BitextRule> bRules;
+  private Rule currentRule;
+
+  Main(final boolean verbose, final boolean taggerOnly,
+      final Language language, final Language motherTongue,
+      final String[] disabledRules, final String[] enabledRules,
+      final boolean apiFormat, boolean applySuggestions, 
+      boolean autoDetect, boolean singleLineBreakMarksParagraph) throws 
IOException,
+      SAXException, ParserConfigurationException {
+    this.verbose = verbose;
+    this.apiFormat = apiFormat;
+    this.taggerOnly = taggerOnly;
+    this.applySuggestions = applySuggestions;
+    this.autoDetect = autoDetect;
+    this.enabledRules = enabledRules;
+    this.disabledRules = disabledRules;
+    this.motherTongue = motherTongue;
+    this.singleLineBreakMarksParagraph = singleLineBreakMarksParagraph;
+    profileRules = false;
+    bitextMode = false;
+    srcLt = null;
+    bRules = null;
+    lt = new JLanguageTool(language, motherTongue);
+    lt.activateDefaultPatternRules();
+    lt.activateDefaultFalseFriendRules();
+    Tools.selectRules(lt, disabledRules, enabledRules);
+  }
+
+  /**
+   * Returns the {@link JLanguageTool} instance currently used for checking.
+   * The instance is replaced when bitext mode is set or the language is changed.
+   */
+  JLanguageTool getJLanguageTool() {
+    return this.lt;
+  }
+  
+  /** Forwards the "list unknown words" setting to the current {@link JLanguageTool}. */
+  private void setListUnknownWords(final boolean listUnknownWords) {
+    this.lt.setListUnknownWords(listUnknownWords);
+  }
+  
+  /** Asks {@link JLanguageTool} to remove its temporary files. */
+  private void cleanUp() {
+    JLanguageTool.removeTemporaryFiles();
+  }
+  
+
+  /** Switches on rule profiling for all subsequent runs. */
+  private void setProfilingMode() {
+    this.profileRules = true;
+  }
+
+  /**
+   * Switches to bitext mode: the current tool is replaced by one for the same
+   * (target) language without a mother tongue, a second tool is created for
+   * {@code sourceLang}, and the bitext rules for this language pair are
+   * loaded and filtered by the given rule ids.
+   *
+   * @param sourceLang language of the source side of the bitext
+   * @param disabledRules ids of rules to remove from the bitext rules
+   * @param enabledRules if not empty, only bitext rules with these ids are kept,
+   *                     in the order given here
+   */
+  private void setBitextMode(final Language sourceLang,
+      final String[] disabledRules, final String[] enabledRules)
+      throws IOException, ParserConfigurationException, SAXException {
+    bitextMode = true;
+    final Language target = lt.getLanguage();
+    lt = new JLanguageTool(target, null);
+    srcLt = new JLanguageTool(sourceLang);
+    lt.activateDefaultPatternRules();
+    Tools.selectRules(lt, disabledRules, enabledRules);
+    Tools.selectRules(srcLt, disabledRules, enabledRules);
+    bRules = Tools.getBitextRules(sourceLang, lt.getLanguage());
+
+    // step 1: drop every bitext rule whose id was explicitly disabled
+    final List<BitextRule> notDisabled = new ArrayList<BitextRule>(bRules);
+    for (final BitextRule candidate : bRules) {
+      for (final String disabledId : disabledRules) {
+        if (candidate.getId().equals(disabledId)) {
+          notDisabled.remove(candidate);
+        }
+      }
+    }
+    bRules = notDisabled;
+    if (enabledRules.length <= 0) {
+      return;
+    }
+
+    // step 2: an explicit list of enabled rules restricts the result to exactly those rules
+    final List<BitextRule> onlyEnabled = new ArrayList<BitextRule>();
+    for (final String enabledId : enabledRules) {
+      for (final BitextRule candidate : notDisabled) {
+        if (candidate.getId().equals(enabledId)) {
+          onlyEnabled.add(candidate);
+        }
+      }
+    }
+    bRules = onlyEnabled;
+  }
+
+  private void runOnFile(final String filename, final String encoding,
+      final boolean listUnknownWords, final boolean xmlFiltering) throws 
IOException {
+    boolean oneTime = false;
+    if (!"-".equals(filename)) {
+      if (autoDetect) {
+        Language language = detectLanguageOfFile(filename, encoding);
+        if (language == null) {
+          System.err.println("Could not detect language well enough, using 
English");
+          language = new English();
+        }
+        changeLanguage(language, motherTongue, disabledRules, enabledRules);
+        System.out.println("Using " + language.getName() + " for file " + 
filename);
+      }
+      final File file = new File(filename);
+      // run once on file if the file size < MAX_FILE_SIZE or
+      // when we use the bitext mode (we use a bitext reader
+      // instead of a direct file access)
+      oneTime = file.length() < MAX_FILE_SIZE || bitextMode;
+    }
+    if (oneTime) {
+      runOnFileInOneGo(filename, encoding, listUnknownWords, xmlFiltering);
+    } else {
+      runOnFileLineByLine(filename, encoding, listUnknownWords);
+    }
+  }
+
+  /**
+   * Processes the whole file with a single call. In bitext mode the file is
+   * read with a {@link TabBitextReader}; otherwise its (optionally XML-filtered)
+   * text is corrected, profiled, tagged or checked depending on the settings.
+   */
+  private void runOnFileInOneGo(String filename, String encoding, boolean listUnknownWords, boolean xmlFiltering) throws IOException {
+    if (bitextMode) {
+      //TODO: add parameter to set different readers
+      final TabBitextReader bitextReader = new TabBitextReader(filename, encoding);
+      if (applySuggestions) {
+        Tools.correctBitext(bitextReader, srcLt, lt, bRules);
+      } else {
+        Tools.checkBitext(bitextReader, srcLt, lt, bRules, apiFormat);
+      }
+      return;
+    }
+
+    final String content = getFilteredText(filename, encoding, xmlFiltering);
+    if (applySuggestions) {
+      System.out.print(Tools.correctText(content, lt));
+    } else if (profileRules) {
+      Tools.profileRulesOnText(content, lt);
+    } else if (taggerOnly) {
+      Tools.tagText(content, lt);
+    } else {
+      Tools.checkText(content, lt, apiFormat, 0);
+    }
+    if (listUnknownWords) {
+      System.out.println("Unknown words: " + lt.getUnknownWords());
+    }
+  }
+
+  private void runOnFileLineByLine(String filename, String encoding, boolean 
listUnknownWords) throws IOException {
+    if (verbose) {
+      lt.setOutput(System.err);
+    }
+    if (!apiFormat && !applySuggestions) {
+      if ("-".equals(filename)) {
+        System.out.println("Working on STDIN...");
+      } else {
+        System.out.println("Working on " + filename + "...");
+      }
+    }
+    int runCount = 1;
+    final List<Rule> rules = lt.getAllActiveRules();
+    if (profileRules) {
+      System.out.printf("Testing %d rules\n", rules.size());
+      System.out.println("Rule ID\tTime\tSentences\tMatches\tSentences per 
sec.");
+      runCount = rules.size();
+    }
+    InputStreamReader isr = null;
+    BufferedReader br = null;
+    int lineOffset = 0;
+    int tmpLineOffset = 0;
+    final List<String> unknownWords = new ArrayList<String>();
+    StringBuilder sb = new StringBuilder();
+    for (int ruleIndex = 0; !rules.isEmpty() && ruleIndex < runCount; 
ruleIndex++) {
+      currentRule = rules.get(ruleIndex);
+      int matches = 0;
+      long sentences = 0;
+      final long startTime = System.currentTimeMillis();
+      try {
+        isr = getInputStreamReader(filename, encoding, isr);
+        br = new BufferedReader(isr);
+        String line;
+        int lineCount = 0;
+        while ((line = br.readLine()) != null) {
+          sb.append(line);
+          lineCount++;    
+          // to detect language from the first input line
+          if (lineCount == 1 && autoDetect) {
+            Language language = detectLanguageOfString(line);
+            if (language == null) {
+              System.err.println("Could not detect language well enough, using 
English");
+              language = new English();
+            }
+            System.out.println("Language used is: " + language.getName());
+            language.getSentenceTokenizer().setSingleLineBreaksMarksParagraph(
+                    singleLineBreakMarksParagraph);
+            changeLanguage(language, motherTongue, disabledRules, 
enabledRules);
+          }
+          sb.append('\n');
+          tmpLineOffset++;
+          if 
(lt.getLanguage().getSentenceTokenizer().singleLineBreaksMarksPara()) {
+            matches = handleLine(matches, lineOffset, sb);
+            sentences += lt.getSentenceCount();
+            if (profileRules) {
+              sentences += lt.sentenceTokenize(sb.toString()).size();
+            }
+            if (listUnknownWords && !taggerOnly) {
+              for (String word : lt.getUnknownWords())
+                if (!unknownWords.contains(word)) {
+                  unknownWords.add(word);
+                }
+            }
+            sb = new StringBuilder();
+            lineOffset = tmpLineOffset;
+          } else {
+            if ("".equals(line) || sb.length() >= MAX_FILE_SIZE) {
+              matches = handleLine(matches, lineOffset, sb);
+              sentences += lt.getSentenceCount();
+              if (profileRules) {
+                sentences += lt.sentenceTokenize(sb.toString()).size();
+              }
+              if (listUnknownWords && !taggerOnly) {
+                for (String word : lt.getUnknownWords())
+                  if (!unknownWords.contains(word)) {
+                    unknownWords.add(word);
+                  }
+              }
+              sb = new StringBuilder();
+              lineOffset = tmpLineOffset;
+            }
+          }
+        }
+      } finally {
+        if (sb.length() > 0) {
+          matches = handleLine(matches, tmpLineOffset - 1, sb);
+          sentences += lt.getSentenceCount();
+          if (profileRules) {
+            sentences += lt.sentenceTokenize(sb.toString()).size();
+          }
+          if (apiFormat && !taggerOnly && !applySuggestions) {
+              System.out.println("</matches>");
+          }
+          if (listUnknownWords && !taggerOnly) {
+            for (String word : lt.getUnknownWords()) {
+              if (!unknownWords.contains(word)) {
+                unknownWords.add(word);
+              }
+            }
+          }
+        }
+        printTimingInformation(listUnknownWords, rules, unknownWords, 
ruleIndex, matches, sentences, startTime);
+        if (br != null) {
+          br.close();
+        }
+        if (isr != null) {
+          isr.close();
+        }
+      }
+    }
+  }
+
+  private InputStreamReader getInputStreamReader(String filename, String 
encoding, InputStreamReader isr)
+          throws UnsupportedEncodingException, FileNotFoundException {
+    if (!"-".equals(filename)) {
+      final File file = new File(filename);
+      if (encoding != null) {
+        isr = new InputStreamReader(new BufferedInputStream(
+            new FileInputStream(file.getAbsolutePath())), encoding);
+      } else {
+        isr = new InputStreamReader(new BufferedInputStream(
+            new FileInputStream(file.getAbsolutePath())));
+      }
+    } else {
+      if (encoding != null) {
+        isr = new InputStreamReader(new BufferedInputStream(System.in), 
encoding);
+      } else {
+        isr = new InputStreamReader(new BufferedInputStream(System.in));
+      }
+    }
+    return isr;
+  }
+
+  /**
+   * Prints how long the run that started at {@code startTime} took and, if
+   * requested, the sorted list of unknown words. In API format the output is
+   * wrapped in an XML comment. Nothing is printed when suggestions are applied.
+   * Note that {@code unknownWords} is sorted in place.
+   *
+   * @param ruleIndex index into {@code rules} of the rule just profiled; only used in profiling mode
+   * @param startTime start of the run as returned by {@code System.currentTimeMillis()}
+   */
+  private void printTimingInformation(final boolean listUnknownWords, final List<Rule> rules,
+      final List<String> unknownWords, final int ruleIndex, final int matches,
+      final long sentences, final long startTime) {
+    if (applySuggestions) {
+      return;
+    }
+    final long elapsedMillis = System.currentTimeMillis() - startTime;
+    final float elapsedSeconds = elapsedMillis / 1000.0f;
+    final float sentencesPerSecond = sentences / elapsedSeconds;
+    if (apiFormat) {
+      System.out.println("<!--");
+    }
+    if (profileRules) {
+      //TODO: run 10 times, line in runOnce mode, and use median
+      System.out.printf(Locale.ENGLISH,
+          "%s\t%d\t%d\t%d\t%.1f", rules.get(ruleIndex).getId(),
+          elapsedMillis, sentences, matches, sentencesPerSecond);
+    } else {
+      System.out.printf(Locale.ENGLISH,
+          "Time: %dms for %d sentences (%.1f sentences/sec)", elapsedMillis,
+          sentences, sentencesPerSecond);
+    }
+    System.out.println();
+    if (listUnknownWords) {
+      Collections.sort(unknownWords);
+      System.out.println("Unknown words: " + unknownWords);
+    }
+    if (apiFormat) {
+      System.out.println("-->");
+    }
+  }
+
+  private int handleLine(final int matchNo, final int lineOffset,
+      final StringBuilder sb) throws IOException {
+    int matches = matchNo;
+    if (applySuggestions) {
+      System.out.print(Tools.correctText(StringTools.filterXML(sb.toString()),
+          lt));
+    } else if (profileRules) {
+      matches += 
Tools.profileRulesOnLine(StringTools.filterXML(sb.toString()), 
+          lt, currentRule);
+    } else if (!taggerOnly) {
+      if (matches == 0) {
+        matches += Tools.checkText(StringTools.filterXML(sb.toString()), lt,
+            apiFormat, -1, lineOffset, matches,
+            StringTools.XmlPrintMode.START_XML);
+      } else {
+        matches += Tools.checkText(StringTools.filterXML(sb.toString()), lt,
+            apiFormat, -1, lineOffset, matches,
+            StringTools.XmlPrintMode.CONTINUE_XML);
+      }
+    } else {
+      Tools.tagText(StringTools.filterXML(sb.toString()), lt);
+    }
+    return matches;
+  }
+
+  /**
+   * Checks all files below the given directory, descending into
+   * sub-directories.
+   *
+   * @param filename path of the directory to start at
+   * @throws IllegalArgumentException if {@code filename} is not a directory
+   * @throws IOException if the content of a directory cannot be listed or a file cannot be processed
+   */
+  private void runRecursive(final String filename, final String encoding,
+      final boolean listUnknown, final boolean xmlFiltering) throws IOException, ParserConfigurationException, SAXException {
+    final File dir = new File(filename);
+    if (!dir.isDirectory()) {
+      throw new IllegalArgumentException(dir.getAbsolutePath() + " is not a directory, cannot use recursion");
+    }
+    final File[] files = dir.listFiles();
+    if (files == null) {
+      // listFiles() returns null on I/O errors, e.g. missing read permission
+      throw new IOException("Could not list the content of directory " + dir.getAbsolutePath());
+    }
+    for (final File file : files) {
+      if (file.isDirectory()) {
+        runRecursive(file.getAbsolutePath(), encoding, listUnknown, xmlFiltering);
+      } else {
+        runOnFile(file.getAbsolutePath(), encoding, listUnknown, xmlFiltering);
+      }
+    }
+  }
+
+  /**
+   * Loads filename and, if requested, filters out XML. Note that the XML
+   * filtering can lead to incorrect positions in the list of matching rules.
+   *
+   * @param filename file to read
+   * @param encoding character encoding passed on to {@code StringTools.readFile}
+   * @param xmlFiltering whether to filter XML from the file content
+   * @return the file content, XML-filtered if {@code xmlFiltering} is set
+   */
+  private String getFilteredText(final String filename, final String encoding, boolean xmlFiltering) throws IOException {
+    if (verbose) {
+      lt.setOutput(System.err);
+    }
+    if (!apiFormat && !applySuggestions) {
+      System.out.println("Working on " + filename + "...");
+    }
+    final FileInputStream stream = new FileInputStream(filename);
+    final String fileContents;
+    try {
+      fileContents = StringTools.readFile(stream, encoding);
+    } finally {
+      // always release the file handle, also if reading fails
+      stream.close();
+    }
+    if (xmlFiltering) {
+      return StringTools.filterXML(fileContents);
+    } else {
+      return fileContents;
+    }
+  }
+
+  private void changeLanguage(Language language, Language motherTongue,
+                              String[] disabledRules, String[] enabledRules) {
+    try {
+      lt = new JLanguageTool(language, motherTongue);
+      lt.activateDefaultPatternRules();
+      lt.activateDefaultFalseFriendRules();
+      Tools.selectRules(lt, disabledRules, enabledRules);
+      if (verbose) {
+        lt.setOutput(System.err);
+      }
+    } catch (Exception e) {
+      throw new RuntimeException("Could not create LanguageTool instance for 
language " + language, e);
+    }
+  }
+
+  /**
+   * Command line tool to check plain text files.
+   *
+   * <p>Parses the options, handles the informational options (usage, version,
+   * list of languages) and then checks the given file, directory (recursive
+   * mode) or STDIN. English is used if no language was specified. Temporary
+   * files are removed at the end, also if checking fails with an exception.
+   *
+   * @param args the command line arguments
+   */
+  public static void main(final String[] args) throws IOException, ParserConfigurationException, SAXException {
+    final CommandLineParser commandLineParser = new CommandLineParser();
+    CommandLineOptions options = null;
+    try {
+      options = commandLineParser.parseOptions(args);
+    } catch (WrongParameterNumberException e) {
+      commandLineParser.printUsage();
+      System.exit(1);
+    } catch (IllegalArgumentException e) {
+      System.err.println(e.toString());
+      System.exit(1);
+    } catch (UnknownParameterException e) {
+      if (e.getMessage() != null) {
+        System.err.println(e.getMessage());
+      } else {
+        System.err.println(e.toString());
+      }
+      commandLineParser.printUsage(System.err);
+      System.exit(1);
+    }
+    if (options.isPrintUsage()) {
+      commandLineParser.printUsage();
+      System.exit(1);
+    }
+    if (options.isPrintVersion()) {
+      System.out.println("LanguageTool version " + JLanguageTool.VERSION);
+      System.exit(0);
+    }
+    if (options.isPrintLanguages()) {
+      printLanguages();
+      System.exit(0);
+    }
+
+    // no file name means: read from STDIN
+    if (options.getFilename() == null) {
+      options.setFilename("-");
+    }
+
+    if (options.getLanguage() == null) {
+      if (!options.isApiFormat() && !options.isAutoDetect()) {
+        System.err.println("No language specified, using English");
+      }
+      options.setLanguage(new English());
+    } else if (!options.isApiFormat() && !options.isApplySuggestions()) {
+      System.out.println("Expected text language: " + options.getLanguage().getName());
+    }
+
+    options.getLanguage().getSentenceTokenizer().setSingleLineBreaksMarksParagraph(
+            options.isSingleLineBreakMarksParagraph());
+    final Main prg = new Main(options.isVerbose(), options.isTaggerOnly(),
+            options.getLanguage(), options.getMotherTongue(),
+            options.getDisabledRules(), options.getEnabledRules(),
+            options.isApiFormat(), options.isApplySuggestions(),
+            options.isAutoDetect(), options.isSingleLineBreakMarksParagraph());
+    try {
+      prg.setListUnknownWords(options.isListUnknown());
+      if (options.isProfile()) {
+        prg.setProfilingMode();
+      }
+      if (options.isBitext()) {
+        if (options.getMotherTongue() == null) {
+          throw new IllegalArgumentException("You have to set the source language (as mother tongue) in bitext mode.");
+        }
+        prg.setBitextMode(options.getMotherTongue(), options.getDisabledRules(), options.getEnabledRules());
+      }
+      if (options.isRecursive()) {
+        prg.runRecursive(options.getFilename(), options.getEncoding(), options.isListUnknown(), options.isXmlFiltering());
+      } else {
+        prg.runOnFile(options.getFilename(), options.getEncoding(), options.isListUnknown(), options.isXmlFiltering());
+      }
+    } finally {
+      // remove temporary files even if checking was aborted by an exception
+      prg.cleanUp();
+    }
+  }
+
+  /**
+   * Prints one line per entry of {@code Language.REAL_LANGUAGES}, consisting
+   * of the short name with variant and the language name, sorted alphabetically.
+   */
+  private static void printLanguages() {
+    final List<String> lines = new ArrayList<String>();
+    for (Language lang : Language.REAL_LANGUAGES) {
+      final String code = lang.getShortNameWithVariant();
+      lines.add(code + " " + lang.getName());
+    }
+    Collections.sort(lines);
+    for (int i = 0; i < lines.size(); i++) {
+      System.out.println(lines.get(i));
+    }
+  }
+
+  // for language auto detect
+  // TODO: alter tika's language profiles so they are in line with LT's supported languages
+  /**
+   * Reads the given file and detects the language of its content.
+   *
+   * @param filename file to read
+   * @param encoding character encoding passed on to {@code StringTools.readFile}
+   * @return the result of {@link #detectLanguageOfString(String)} for the file
+   *         content; callers treat {@code null} as "could not detect"
+   */
+  private static Language detectLanguageOfFile(final String filename, final String encoding) throws IOException {
+    final FileInputStream stream = new FileInputStream(filename);
+    final String text;
+    try {
+      text = StringTools.readFile(stream, encoding);
+    } finally {
+      // always release the file handle, also if reading fails
+      stream.close();
+    }
+    return detectLanguageOfString(text);
+  }
+
+  /**
+   * Lets Tika's {@link LanguageIdentifier} identify the language of
+   * {@code text} and looks up the {@link Language} for the identified short
+   * name. Callers treat a {@code null} result as "could not detect".
+   */
+  private static Language detectLanguageOfString(final String text) {
+    final String shortName = new LanguageIdentifier(text).getLanguage();
+    return Language.getLanguageForShortName(shortName);
+  }
+
+}

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Master Visual Studio, SharePoint, SQL, ASP.NET, C# 2012, HTML5, CSS,
MVC, Windows 8 Apps, JavaScript and much more. Keep your skills current
with LearnDevNow - 3,200 step-by-step video tutorials by Microsoft
MVPs and experts. ON SALE this month only -- learn more at:
http://p.sf.net/sfu/learnnow-d2d
_______________________________________________
Languagetool-commits mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-commits

SF.net SVN: languagetool:[9205] trunk/languagetool/languagetool-standalone/ src/main/java/org/languagetool/commandline/Main.java

Reply via email to