On Saturday, 24 December 2016 at 12:42:31 UTC, Andrew Edwards wrote:
The authors of "The Art of Java" present, as a first coding example, a recursive-descent parser to demonstrate Java's ability to facilitate low level programming commonly performed in C and C++.

I took the opportunity to port the code to D. By doing this, I now have an understanding of how a recursive-descent parser is written and works. What I am now interested in, is learning about what makes this particular implementation a poor or strong one, how to improve upon it and other, possibly superior, implementations made possible by D.

Comments are invited and appreciated.

Andrew Edwards
If at first you don't succeed...

=========================
rdp2.d
=========================
//module rdp2;

class ParserException : Throwable
{
    this(string msg)
    {
        super(msg);
    }
}

// These are the token types.
enum
{
    NONE,
    DELIMITER,
    VARIABLE,
    NUMBER
}

// These are the types of syntax errors.
enum
{
    SYNTAX,
    UNBALPARENS,
    NOEXP,
    DIVBYZERO
}

// This token indecates end-of-expression.
enum EOE = ";";

string exp;     // refers to expression string
int ndx;     // current index into the expression
string token;   // holds current token
int tokType;    // holds token's types

// Array for variables.
double[] vars = new double[26];

// Parser entry poing.
double evaluate(string expstr)
{
    double result;
    exp = expstr;
    ndx = 0;

    getToken();
    if (token == EOE)
        handleErr(NOEXP); // no expression present

    // Parse and evaluate the expression.
    result = evalExp1();

    if (token != EOE)
        handleErr(SYNTAX); // last token must be EOE

    return result;
}

// Process an assignment.
double evalExp1()
{
    double result;
    int varNdx;
    int ttokType;
    string tmpToken;

    if (tokType == VARIABLE)
    {
        // save old token
        tmpToken = token;
        ttokType = tokType;

        // Compute the index of the variable.
        import std.ascii: toUpper;
        varNdx = token[0].toUpper - 'A';

        getToken();
        if (token != "=")
        {
            putBack(); // return current token
            // restore old token -- not an assignment
            token = tmpToken;
            tokType = ttokType;
        }
        else
        {
            getToken(); // get next part of exp
            result = evalExp2();
            vars[varNdx] = result;
            return result;
        }
    }
    return evalExp2();
}

// Add or subtract two terms
double evalExp2()
{
    char op;
    double result;
    double partialResult;

    result = evalExp3();

    while ((op = token[0]) == '+' || op == '-')
    {
        getToken();
        partialResult = evalExp3();
        final switch (op)
        {
            case '-':
            {
                result -= partialResult;
                break;
            }
            case '+':
            {
                result += partialResult;
                break;
            }
        }
    }
    return result;
}

// Multiply or divide two factors
double evalExp3()
{
    char op;
    double result;
    double partialResult;

    result = evalExp4();

    while ((op = token[0]) == '*' || op == '/' || op == '%')
    {
        getToken();
        partialResult = evalExp4();
        final switch (op)
        {
            case '*':
            {
                result *= partialResult;
                break;
            }
            case '/':
            {
                if (partialResult == 0.0)
                    handleErr(DIVBYZERO);
                result /= partialResult;
                break;
            }
            case '%':
            {
                if (partialResult == 0.0)
                    handleErr(DIVBYZERO);
                result %= partialResult;
                break;
            }
        }
    }
    return result;
}

// Process an exponent.
double evalExp4()
{
    double result;
    double partialResult;
    double ex;

    result = evalExp5();

    if (token == "^")
    {
        getToken();
        partialResult = evalExp4();
        ex = result;
        if (partialResult == 0.0)
        {
            result = 1.0;
        }
        else
        {
            for (int t = cast(int)partialResult-1; t > 0; t--)
                result *= ex;
        }
    }
    return result;
}

// Evaluate a unary + or -
double evalExp5()
{
    double result;
    string op;

    op = null;
    if ((tokType == DELIMITER) && token == "+" || token == "-")
    {

        op = token;
        getToken();
    }
    result = evalExp6();

    //if (op == "-") result = -result;

    return (op == "-") ? -result : result;
}

// Process a parenthesized expression.
double evalExp6()
{
    double result;

    if (token == "(")
    {
        getToken();
        result = evalExp2();
        if (token != ")")
            handleErr(UNBALPARENS);
        getToken();
    }
    else
        result = atom();

    return result;
}

// Get the value of a number.
double atom()
{
    double result = 0.0;

    switch (tokType)
    {
        case NUMBER:
            try
            {
                import std.conv: parse;
                result = token.parse!double;
            }
            catch (Exception e)
            {
                handleErr(SYNTAX);
            }
            getToken();
            break;
        case VARIABLE:
            result = findVar(token);
            getToken();
            break;
        default:
            handleErr(SYNTAX);
            break;
    }
    return result;
}

// Return the value of a variable.
double findVar(string vname)
{
    import std.ascii: isAlpha, toUpper;
    if (!vname[0].isAlpha)
    {
        handleErr(SYNTAX);
        return 0.0;
    }

    return vars[(vname[0] - 'A').toUpper];
}

// Return a token to the input stream.
void putBack()
{
    if (token == EOE) return;
    foreach(i; 0 .. token.length) ndx--;
}

// Handle an error.
void handleErr(int error)
{
    string[] err = [
        "Syntax Error",
        "Unbalanced Parentheses",
        "No Expression Present",
        "Division by Zero"
    ];

    throw new ParserException(err[error]);
}

// Obtain the next token.
void getToken()
{
    import std.ascii: isAlpha, isDigit, isWhite;
    tokType = NONE;
    token = null;

    // Check for end of expression
    if (ndx == exp.length)
    {
        token = EOE.dup;
        return;
    }

    // Skip over white space
    while (ndx < exp.length && exp[ndx].isWhite) ++ndx;

    // Trailing whitespace ends expression.
    if (ndx == exp.length)
    {
        token = EOE.dup;
        return;
    }

    if (exp[ndx].isDelim) // is operator
    {
        token ~= exp[ndx];
        ndx++;
        tokType = DELIMITER;
    }
    else if (exp[ndx].isAlpha) // is variable
    {
        while (!exp[ndx].isDelim)
        {
            token ~= exp[ndx];
            ndx++;
            if (ndx >= exp.length) break;
        }
        tokType = VARIABLE;
    }
    else if (exp[ndx].isDigit) // is number
    {
        while (!exp[ndx].isDelim)
        {
            token ~= exp[ndx];
            ndx++;
            if (ndx >= exp.length) break;

        }
        tokType = NUMBER;
    }
    else // unknown character terminates expression
    {
        token = EOE.dup;
        return;
    }
}

// Returns true if c is a delimiter.
bool isDelim(char c)
{
    import std.algorithm: canFind;
    return canFind(" +-/*%^=()", c);
}

=========================
demordp.d
=========================
mixin(import("rdp2.d"));

void main(string[] args)
{
    string expr;

    import std.stdio: write, writeln, readln;
    writeln("Enter an empty expression to stop.");

    for(;;)
    {
        write("Enter an expression: ");
        expr = readln();
        if (expr == "\n") break;
        try
        {
            writeln("Result: ", expr.evaluate);
        }
        catch (ParserException e)
        {
            e.msg.writeln;
        }
    }
}

All of the performance characteristics crucially depend on the language you want to parse. In general you want to match as many tokes at as you can in a single parse-tree node. Since that will reduce the number of function calls you have to make.
You want to preallocate buffers.
Use the append-functionality sparingly since that involves a call into a shared library. Avoid or encapsulate global state if you can since it can get tricky to mange.
Other then that there is not much general advice I can give.

Try to tag your parse-nodes since then you can match patterns using a nested switch, which is one of the cleanest routes to do it.

  • Recursive-descent parsing Andrew Edwards via Digitalmars-d-learn
    • Re: Recursive-descent parsing Stefan Koch via Digitalmars-d-learn

Reply via email to