Re: Recursive-descent parsing

Stefan Koch via Digitalmars-d-learn Sat, 24 Dec 2016 06:16:19 -0800

On Saturday, 24 December 2016 at 12:42:31 UTC, Andrew Edwardswrote:

The authors of "The Art of Java" present, as a first codingexample, a recursive-descent parser to demonstrate Java'sability to facilitate low level programming commonly performedin C and C++.

I took the opportunity to port the code to D. By doing this, Inow have an understanding of how a recursive-descent parser iswritten and works. What I am now interested in, is learningabout what makes this particular implementation a poor orstrong one, how to improve upon it and other, possiblysuperior, implementations made possible by D.


Comments are invited and appreciated.

Andrew Edwards
If at first you don't succeed...

=========================
rdp2.d
=========================
//module rdp2;

class ParserException : Throwable
{
    this(string msg)
    {
        super(msg);
    }
}

// These are the token types.
enum
{
    NONE,
    DELIMITER,
    VARIABLE,
    NUMBER
}

// These are the types of syntax errors.
enum
{
    SYNTAX,
    UNBALPARENS,
    NOEXP,
    DIVBYZERO
}

// This token indecates end-of-expression.
enum EOE = ";";

string exp;     // refers to expression string
int ndx;     // current index into the expression
string token;   // holds current token
int tokType;    // holds token's types

// Array for variables.
double[] vars = new double[26];

// Parser entry poing.
double evaluate(string expstr)
{
    double result;
    exp = expstr;
    ndx = 0;

    getToken();
    if (token == EOE)
        handleErr(NOEXP); // no expression present

    // Parse and evaluate the expression.
    result = evalExp1();

    if (token != EOE)
        handleErr(SYNTAX); // last token must be EOE

    return result;
}

// Process an assignment.
double evalExp1()
{
    double result;
    int varNdx;
    int ttokType;
    string tmpToken;

    if (tokType == VARIABLE)
    {
        // save old token
        tmpToken = token;
        ttokType = tokType;

        // Compute the index of the variable.
        import std.ascii: toUpper;
        varNdx = token[0].toUpper - 'A';

        getToken();
        if (token != "=")
        {
            putBack(); // return current token
            // restore old token -- not an assignment
            token = tmpToken;
            tokType = ttokType;
        }
        else
        {
            getToken(); // get next part of exp
            result = evalExp2();
            vars[varNdx] = result;
            return result;
        }
    }
    return evalExp2();
}

// Add or subtract two terms
double evalExp2()
{
    char op;
    double result;
    double partialResult;

    result = evalExp3();

    while ((op = token[0]) == '+' || op == '-')
    {
        getToken();
        partialResult = evalExp3();
        final switch (op)
        {
            case '-':
            {
                result -= partialResult;
                break;
            }
            case '+':
            {
                result += partialResult;
                break;
            }
        }
    }
    return result;
}

// Multiply or divide two factors
double evalExp3()
{
    char op;
    double result;
    double partialResult;

    result = evalExp4();

    while ((op = token[0]) == '*' || op == '/' || op == '%')
    {
        getToken();
        partialResult = evalExp4();
        final switch (op)
        {
            case '*':
            {
                result *= partialResult;
                break;
            }
            case '/':
            {
                if (partialResult == 0.0)
                    handleErr(DIVBYZERO);
                result /= partialResult;
                break;
            }
            case '%':
            {
                if (partialResult == 0.0)
                    handleErr(DIVBYZERO);
                result %= partialResult;
                break;
            }
        }
    }
    return result;
}

// Process an exponent.
double evalExp4()
{
    double result;
    double partialResult;
    double ex;

    result = evalExp5();

    if (token == "^")
    {
        getToken();
        partialResult = evalExp4();
        ex = result;
        if (partialResult == 0.0)
        {
            result = 1.0;
        }
        else
        {
            for (int t = cast(int)partialResult-1; t > 0; t--)
                result *= ex;
        }
    }
    return result;
}

// Evaluate a unary + or -
double evalExp5()
{
    double result;
    string op;

    op = null;
    if ((tokType == DELIMITER) && token == "+" || token == "-")
    {

        op = token;
        getToken();
    }
    result = evalExp6();

    //if (op == "-") result = -result;

    return (op == "-") ? -result : result;
}

// Process a parenthesized expression.
double evalExp6()
{
    double result;

    if (token == "(")
    {
        getToken();
        result = evalExp2();
        if (token != ")")
            handleErr(UNBALPARENS);
        getToken();
    }
    else
        result = atom();

    return result;
}

// Get the value of a number.
double atom()
{
    double result = 0.0;

    switch (tokType)
    {
        case NUMBER:
            try
            {
                import std.conv: parse;
                result = token.parse!double;
            }
            catch (Exception e)
            {
                handleErr(SYNTAX);
            }
            getToken();
            break;
        case VARIABLE:
            result = findVar(token);
            getToken();
            break;
        default:
            handleErr(SYNTAX);
            break;
    }
    return result;
}

// Return the value of a variable.
double findVar(string vname)
{
    import std.ascii: isAlpha, toUpper;
    if (!vname[0].isAlpha)
    {
        handleErr(SYNTAX);
        return 0.0;
    }

    return vars[(vname[0] - 'A').toUpper];
}

// Return a token to the input stream.
void putBack()
{
    if (token == EOE) return;
    foreach(i; 0 .. token.length) ndx--;
}

// Handle an error.
void handleErr(int error)
{
    string[] err = [
        "Syntax Error",
        "Unbalanced Parentheses",
        "No Expression Present",
        "Division by Zero"
    ];

    throw new ParserException(err[error]);
}

// Obtain the next token.
void getToken()
{
    import std.ascii: isAlpha, isDigit, isWhite;
    tokType = NONE;
    token = null;

    // Check for end of expression
    if (ndx == exp.length)
    {
        token = EOE.dup;
        return;
    }

    // Skip over white space
    while (ndx < exp.length && exp[ndx].isWhite) ++ndx;

    // Trailing whitespace ends expression.
    if (ndx == exp.length)
    {
        token = EOE.dup;
        return;
    }

    if (exp[ndx].isDelim) // is operator
    {
        token ~= exp[ndx];
        ndx++;
        tokType = DELIMITER;
    }
    else if (exp[ndx].isAlpha) // is variable
    {
        while (!exp[ndx].isDelim)
        {
            token ~= exp[ndx];
            ndx++;
            if (ndx >= exp.length) break;
        }
        tokType = VARIABLE;
    }
    else if (exp[ndx].isDigit) // is number
    {
        while (!exp[ndx].isDelim)
        {
            token ~= exp[ndx];
            ndx++;
            if (ndx >= exp.length) break;

        }
        tokType = NUMBER;
    }
    else // unknown character terminates expression
    {
        token = EOE.dup;
        return;
    }
}

// Returns true if c is a delimiter.
bool isDelim(char c)
{
    import std.algorithm: canFind;
    return canFind(" +-/*%^=()", c);
}

=========================
demordp.d
=========================
mixin(import("rdp2.d"));

void main(string[] args)
{
    string expr;

    import std.stdio: write, writeln, readln;
    writeln("Enter an empty expression to stop.");

    for(;;)
    {
        write("Enter an expression: ");
        expr = readln();
        if (expr == "\n") break;
        try
        {
            writeln("Result: ", expr.evaluate);
        }
        catch (ParserException e)
        {
            e.msg.writeln;
        }
    }
}

All of the performance characteristics crucially depend on thelanguage you want to parse.In general you want to match as many tokes at as you can in asingle parse-tree node.Since that will reduce the number of function calls you have tomake.

You want to preallocate buffers.

Use the append-functionality sparingly since that involves a callinto a shared library.Avoid or encapsulate global state if you can since it can gettricky to mange.

Other then that there is not much general advice I can give.

Try to tag your parse-nodes since then you can match patternsusing a nested switch, which is one of the cleanest routes to doit.

Re: Recursive-descent parsing

Reply via email to