On 5/2/2011 9:47 AM, Jim Idle wrote:
I suspect that you are approaching this problem incorrectly in some way.
Why do you feel you need to specify a new token at the AST stage? Why
don't you restate your goal, ignoring what you have done so far - I
suspect that we may be trying to solve a problem that you should not have.
Certainly. I was trying to keep things simple/short, but I can expand.
My project is an NLP tokenizer/parser. The first stage of functionality
is implemented in the FuzzyLexer and FuzzyParser grammars. They strip out
all punctuation and white space, preserving them as tokens and grouping
all the text between the punctuation/white space as "unspecified" tokens.
Stage 1.5 is the language-specific composite grammar (Sentential.g),
which imports the Fuzzy* grammars. Here, I implement regular
expressions used in semantic predicates that attempt to categorize
"unspecified" tokens into relevant categories (see also,
LongNumber.java). For instance, the string "one" would be cast as a
long form number token. Any "unspecified" tokens that don't match any
semantic predicates stay "unspecified" tokens.
Stage 2, which is yet to be written, walks the AST output by stage 1.5
and wraps the tokens up into an application-specific data structure.
This tree grammar will also perform tasks such as clustering together
numbers into one single number, etc.
Courtney Falk
[email protected]
lexer grammar FuzzyLexer;
options {
filter=UNSPECIFIED;
k=2;
}
@members {
private StringBuilder unknown;
{
unknown = new StringBuilder();
}
/**
 * Adds one character to the end of the unknown-text buffer.
 *
 * @param c the character to append
 */
public void appendUnknown(char c) {
    this.unknown.append(c);
}
/**
 * Hands back the buffered unknown text and empties the buffer, so each call
 * returns only what was accumulated since the buffer was last cleared.
 *
 * @return the buffer contents as they were before being cleared
 */
public String getUnknown() {
    final String drained = unknown.toString();
    clearUnknown();
    return drained;
}
/**
 * Discards everything currently held in the unknown-text buffer.
 */
public void clearUnknown() {
    // Truncating to length zero removes every buffered character.
    unknown.setLength(0);
}
/**
 * Reports whether the unknown-text buffer currently holds no characters.
 *
 * @return true when the buffer length is zero
 */
public boolean isUnknownEmpty() {
    final int buffered = unknown.length();
    return buffered == 0;
}
/**
 * Matches the literal s against the input one character at a time, recording
 * every character examined in the unknown-text buffer.
 *
 * On a complete match the buffer is cleared. On a mismatch the buffer is left
 * as is, so it still holds the characters examined so far, including the one
 * that failed to match.
 *
 * @param s the literal text expected at the current input position
 * @throws MismatchedTokenException when a character differs from s and the
 *         recognizer is not backtracking; while backtracking the failure is
 *         reported through state.failed instead
 */
@Override
public void match(String s)
        throws MismatchedTokenException {
    for (int pos = 0; pos < s.length(); pos++) {
        final char expected = s.charAt(pos);
        // Record the lookahead character before testing it, so a character
        // that fails to match is kept in the buffer as well.
        unknown.append((char) input.LA(1));
        if (input.LA(1) == expected) {
            input.consume();
            state.failed = false;
            continue;
        }
        if (state.backtracking > 0) {
            // Speculative match: flag the failure and return without throwing.
            state.failed = true;
            return;
        }
        MismatchedTokenException mismatch =
                new MismatchedTokenException(expected, input);
        recover(mismatch);
        throw mismatch;
    }
    // The whole literal matched, so none of the recorded text is unknown.
    clearUnknown();
}
}
// Punctuation and whitespace tokens. Each punctuation rule matches one fixed
// literal. ELLIPSIS is declared ahead of PERIOD, and MDASH ahead of DASH; in
// both pairs the longer literal begins with the shorter one.
ELLIPSIS : '...';
PERIOD : '.';
QUESTION_MARK : '?';
// Inverted question mark.
LEFT_QUESTION_MARK : '¿';
EXCLAMATION_POINT : '!';
// Inverted exclamation point.
LEFT_EXCLAMATION_POINT : '¡';
COMMA : ',';
COLON : ':';
SEMI_COLON : ';';
// Two consecutive hyphens.
MDASH : '--';
DASH : '-';
FORWARD_SLASH : '/';
QUOTATION_MARK : '"';
SINGLE_QUOTATION_MARK : '\'';
LEFT_PARENTHESIS : '(';
RIGHT_PARENTHESIS : ')';
LEFT_BRACKET : '[';
RIGHT_BRACKET : ']';
LEFT_BRACE : '{';
RIGHT_BRACE : '}';
// Exactly one space, tab, carriage return or newline character per token.
WHITESPACE : ' ' | '\t' | '\r' | '\n';
// Catch-all rule: matches any single character and appends the matched text
// to the unknown buffer. This is the rule named by the filter=UNSPECIFIED
// option at the top of the lexer grammar.
// NOTE(review): the FuzzyParser grammar header is fused onto the end of the
// rule line below; it presumably belongs to a separate file - confirm.
protected
UNSPECIFIED : . { unknown.append(getText()); };parser grammar FuzzyParser;
@members {
// Lexer whose unknown-text buffer the unspecified rule reads. It has no
// initializer, so it must be assigned before that rule runs.
public Sentential_FuzzyLexer lexer;

/**
 * Stores the lexer instance this parser will query for unknown text.
 *
 * @param lexer the lexer to read buffered text from
 */
public void setLexer(Sentential_FuzzyLexer lexer) {
    this.lexer = lexer;
}
}
// A run of one or more consecutive WHITESPACE tokens.
whitespace : WHITESPACE+;
// A run of one or more consecutive UNSPECIFIED tokens. After the run is
// matched, the rule returns (as s) the text drained from the lexer through
// getUnknown(), which also empties the lexer buffer. The lexer field must have
// been assigned beforehand, for example through setLexer.
unspecified returns [String s]
: UNSPECIFIED+
{
$s = lexer.getUnknown();
}
;
// Matches exactly one of the punctuation tokens listed below.
// NOTE(review): presumably these are the marks that do not end a sentence -
// confirm. The LEFT_* marks, parentheses, brackets and braces defined by the
// lexer are not listed in this rule.
nonterminal_punctuation
: COMMA
| COLON
| SEMI_COLON
| FORWARD_SLASH
| MDASH
| DASH
| QUOTATION_MARK
| SINGLE_QUOTATION_MARK
;
// Matches exactly one of PERIOD, EXCLAMATION_POINT, QUESTION_MARK or ELLIPSIS.
// NOTE(review): presumably these are the marks that end a sentence - confirm.
// NOTE(review): the Java package declaration of LongNumber.java is fused onto
// the closing line of this rule; it presumably starts a separate file.
terminal_punctuation
: PERIOD
| EXCLAMATION_POINT
| QUESTION_MARK
| ELLIPSIS
;package com.infiauto.ontosem.lang.eng;
/**
 * English number words ("long form" numbers) together with their numeric
 * value and decimal order of magnitude.
 *
 * <p>The long forms are the exact, lower-case words that callers match
 * against input text, so each one must be spelled the way the word is
 * actually written.
 */
enum LongNumber {
    ZERO("zero", 0, 0),
    ONE("one", 0, 1),
    TWO("two", 0, 2),
    THREE("three", 0, 3),
    FOUR("four", 0, 4),
    FIVE("five", 0, 5),
    SIX("six", 0, 6),
    SEVEN("seven", 0, 7),
    EIGHT("eight", 0, 8),
    NINE("nine", 0, 9),
    TEN("ten", 1, 10),
    ELEVEN("eleven", 1, 11),
    TWELVE("twelve", 1, 12),
    THIRTEEN("thirteen", 1, 13),
    FOURTEEN("fourteen", 1, 14),
    FIFTEEN("fifteen", 1, 15),
    SIXTEEN("sixteen", 1, 16),
    SEVENTEEN("seventeen", 1, 17),
    EIGHTEEN("eighteen", 1, 18),
    // The identifier keeps its historical spelling so existing references
    // still compile; only the matched word is corrected to "nineteen".
    NINTEEN("nineteen", 1, 19),
    TWENTY("twenty", 1, 20),
    THIRTY("thirty", 1, 30),
    FORTY("forty", 1, 40),
    FIFTY("fifty", 1, 50),
    SIXTY("sixty", 1, 60),
    SEVENTY("seventy", 1, 70),
    EIGHTY("eighty", 1, 80),
    // Identifier kept for compatibility; the matched word is "ninety".
    NINTY("ninety", 1, 90),
    HUNDRED("hundred", 2, 100),
    THOUSAND("thousand", 3, 1000),
    MILLION("million", 6, 1000000),
    BILLION("billion", 9, 1000000000);

    /** The word as it is written in text, in lower case. */
    private final String long_form;
    /** Decimal order of magnitude of {@link #value} (digit count minus one). */
    private final long power;
    /** The number the word denotes. */
    private final long value;

    /**
     * @param long_form the word as written in text
     * @param power     decimal order of magnitude of {@code value}
     * @param value     the number the word denotes
     */
    private LongNumber(String long_form, long power, long value) {
        this.long_form = long_form;
        this.power = power;
        this.value = value;
    }

    /** @return the word as it is written in text */
    public String getLongForm() {
        return long_form;
    }

    /** @return the decimal order of magnitude of the value */
    public long getPower() {
        return power;
    }

    /** @return the number the word denotes */
    public long getValue() {
        return value;
    }
}
grammar Sentential;
options {
output=AST;
}
import FuzzyLexer, FuzzyParser;
tokens {
MATCHED;
UNMATCHED;
SPECIAL;
NUMBER_LONG_NODE;
ORDINAL_LONG_NODE;
}
@lexer::header {
package com.infiauto.ontosem.lang.eng;
}
@parser::header {
package com.infiauto.ontosem.lang.eng;
import com.infiauto.ontosem.proc.ExtendedWordSense;
import com.infiauto.ontosem.proc.GeneratedToken;
import com.infiauto.ontosem.proc.ParseToken;
import com.infiauto.ontosem.proc.ParseTokens;
import com.infiauto.ontosem.proc.SpecialCase;
import com.infiauto.ontosem.sks.AbsoluteTime;
import com.infiauto.ontosem.sks.PartOfSpeech;
import java.math.BigInteger;
import java.util.Calendar;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Locale;
import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.datatype.DatatypeConfigurationException;
import javax.xml.datatype.DatatypeFactory;
import javax.xml.datatype.XMLGregorianCalendar;
}
@parser::members {
// Text consisting solely of one or more decimal digits.
private static final Pattern SHORT_NUMBER_PATTERN = Pattern.compile("^\\d+$");

// One or more decimal digits directly followed by st, nd, rd or th.
private static final Pattern SHORT_ORDINAL_PATTERN = Pattern.compile("^\\d+(st|nd|rd|th)$");

// Alternation over the long form of every LongNumber constant.
private static final Pattern LONG_NUMBER_PATTERN = buildLongNumberPattern();

// Alternation over the long form of every LongOrdinal constant.
private static final Pattern LONG_ORDINAL_PATTERN = buildLongOrdinalPattern();

// Lookup from a long form to the LongNumber constant that declares it.
private static final HashMap<String,LongNumber> LONG_NUMBER_MAP = buildLongNumberMap();
/**
 * Builds the anchored pattern that matches exactly one LongNumber long form:
 * the forms are joined with the alternation bar inside a single group.
 *
 * @return the compiled pattern
 */
private static Pattern buildLongNumberPattern() {
    final LongNumber[] numbers = LongNumber.values();

    // Room for the "^(" and ")$" wrapper plus each form and its separator.
    int capacity = 3;
    for (int i = 0; i < numbers.length; i++) {
        capacity += numbers[i].getLongForm().length() + 1;
    }

    final StringBuilder regex = new StringBuilder(capacity);
    regex.append("^(");
    final int prefixLength = regex.length();
    for (LongNumber number : numbers) {
        // Anything beyond the opening prefix means a form was already written.
        if (regex.length() != prefixLength) {
            regex.append('|');
        }
        regex.append(number.getLongForm());
    }
    regex.append(")$");
    return Pattern.compile(regex.toString());
}
/**
 * Builds the anchored pattern that matches exactly one LongOrdinal long form:
 * the forms are joined with the alternation bar inside a single group.
 *
 * @return the compiled pattern
 */
private static Pattern buildLongOrdinalPattern() {
    final LongOrdinal[] ordinals = LongOrdinal.values();

    // Room for the "^(" and ")$" wrapper plus each form and its separator.
    int capacity = 3;
    for (int i = 0; i < ordinals.length; i++) {
        capacity += ordinals[i].getLongForm().length() + 1;
    }

    final StringBuilder regex = new StringBuilder(capacity);
    regex.append("^(");
    final int prefixLength = regex.length();
    for (LongOrdinal ordinal : ordinals) {
        // Anything beyond the opening prefix means a form was already written.
        if (regex.length() != prefixLength) {
            regex.append('|');
        }
        regex.append(ordinal.getLongForm());
    }
    regex.append(")$");
    return Pattern.compile(regex.toString());
}
/**
 * Indexes every LongNumber constant by its long form.
 *
 * @return a new map from long form to the constant declaring it
 */
private static HashMap<String,LongNumber> buildLongNumberMap() {
    final HashMap<String,LongNumber> byLongForm = new HashMap<String,LongNumber>();
    final LongNumber[] numbers = LongNumber.values();
    for (int i = 0; i < numbers.length; i++) {
        final LongNumber number = numbers[i];
        byLongForm.put(number.getLongForm(), number);
    }
    return byLongForm;
}
/**
 * Tests whether s consists entirely of one or more decimal digits.
 *
 * @param s the text to test
 * @return true when the whole of s matches SHORT_NUMBER_PATTERN
 */
private boolean isShortNumber(String s) {
    final Matcher digits = SHORT_NUMBER_PATTERN.matcher(s);
    return digits.matches();
}
/**
 * Tests whether s is a run of decimal digits directly followed by one of the
 * suffixes st, nd, rd or th.
 *
 * @param s the text to test
 * @return true when the whole of s matches SHORT_ORDINAL_PATTERN
 */
private boolean isShortOrdinal(String s) {
    final Matcher ordinal = SHORT_ORDINAL_PATTERN.matcher(s);
    return ordinal.matches();
}
/**
 * Tests whether s is exactly one of the LongNumber long forms.
 *
 * @param s the text to test
 * @return true when the whole of s matches LONG_NUMBER_PATTERN
 */
private boolean isLongNumber(String s) {
    return LONG_NUMBER_PATTERN.matcher(s).matches();
}
/**
 * Tests whether s is exactly one of the LongOrdinal long forms.
 *
 * @param s the text to test
 * @return true when the whole of s matches LONG_ORDINAL_PATTERN
 */
private boolean isLongOrdinal(String s) {
    return LONG_ORDINAL_PATTERN.matcher(s).matches();
}
}
// One or more consecutive matches of the sentence rule.
sentences : sentence+;
// Matches a single element: a run of unspecified text (classified by
// handle_unspecified), a run of whitespace, or one punctuation mark.
// Whitespace and punctuation are rewritten as children of a MATCHED node.
sentence
: handle_unspecified
| whitespace -> ^(MATCHED whitespace)
| nonterminal_punctuation -> ^(MATCHED nonterminal_punctuation)
| terminal_punctuation -> ^(MATCHED terminal_punctuation)
;
// Classifies the text returned by the unspecified rule and emits it as a new
// node under UNMATCHED. The alternatives are listed in this order:
//   1. isLongNumber(text) holds  -> NUMBER_LONG_NODE carrying the text
//   2. isLongOrdinal(text) holds -> ORDINAL_LONG_NODE carrying the text
//   3. otherwise                 -> SPECIAL carrying the text
// NOTE(review): the fragment modifier is applied here to a parser rule -
// confirm that this is intended.
fragment
handle_unspecified
: u=unspecified
(
{ isLongNumber(u.s) }? -> ^(UNMATCHED NUMBER_LONG_NODE[u.s])
| { isLongOrdinal(u.s) }? -> ^(UNMATCHED ORDINAL_LONG_NODE[u.s])
| -> ^(UNMATCHED SPECIAL[u.s]
)
)
;
// Fragment lexer rule whose literal is two consecutive double-quote
// characters.
// NOTE(review): judging by its name this is presumably a placeholder that is
// not expected to match real input - confirm why it is needed.
fragment
NEVER_MATCH : '""';
List: http://www.antlr.org/mailman/listinfo/antlr-interest
Unsubscribe:
http://www.antlr.org/mailman/options/antlr-interest/your-email-address
--
You received this message because you are subscribed to the Google Groups
"il-antlr-interest" group.
To post to this group, send email to [email protected].
To unsubscribe from this group, send email to
[email protected].
For more options, visit this group at
http://groups.google.com/group/il-antlr-interest?hl=en.