================ @@ -0,0 +1,189 @@ +//===-- DILLexer.cpp ------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This implements the lexer for the Data Inspection Language (DIL), and its +// helper functions, which will eventually underlie the 'frame variable' +// command. The language that this lexer tokenizes is described in +// lldb/docs/dil-expr-lang.ebnf +// +//===----------------------------------------------------------------------===// + +#include "lldb/ValueObject/DILLexer.h" +#include "lldb/Utility/Status.h" +#include "llvm/ADT/StringSwitch.h" + +namespace lldb_private { + +namespace dil { + +// Returns the printable name of the given token kind. +llvm::StringRef Token::GetTokenName(Kind kind) { + switch (kind) { + case Kind::coloncolon: + return "coloncolon"; + case Kind::eof: + return "eof"; + case Kind::identifier: + return "identifier"; + case Kind::invalid: + return "invalid"; + case Kind::kw_namespace: + return "namespace"; + case Kind::l_paren: + return "l_paren"; + case Kind::none: + return "none"; + case Kind::r_paren: + return "r_paren"; + case Kind::unknown: + return "unknown"; + } +} + +// Returns true if c is one of 'a'..'z' or 'A'..'Z'. +static bool IsLetter(char c) { + return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'); +} + +// Returns true if c is one of '0'..'9'. +static bool IsDigit(char c) { return '0' <= c && c <= '9'; } + +// A word starts with a letter, underscore, or dollar sign, followed by +// letters ('a'..'z','A'..'Z'), digits ('0'..'9'), and/or underscores. +// Returns the range of the word found at m_cur_pos and advances m_cur_pos +// past it; returns an empty range (leaving m_cur_pos unchanged) if there is +// no word at the current position. +llvm::iterator_range<llvm::StringRef::iterator> DILLexer::IsWord() { + llvm::StringRef::iterator start = m_cur_pos; + bool dollar_start = false; + + // Must not start with a digit. + if (m_cur_pos == m_expr.end() || IsDigit(*m_cur_pos)) + return llvm::make_range(m_cur_pos, m_cur_pos); + + // First character *may* be a '$', for a register name or convenience + // variable. 
+ if (*m_cur_pos == '$') { + dollar_start = true; + ++m_cur_pos; + } + + // Contains only letters, digits or underscores + for (; m_cur_pos != m_expr.end(); ++m_cur_pos) { + char c = *m_cur_pos; + if (!IsLetter(c) && !IsDigit(c) && c != '_') + break; + } + + // If first char is '$', make sure there's at least one more char, or it's + // invalid. + if (dollar_start && (m_cur_pos - start <= 1)) { + m_cur_pos = start; + return llvm::make_range(start, start); // Empty range + } + + return llvm::make_range(start, m_cur_pos); +} + +// Builds a token from the given kind, spelling and position, copies it into +// `result`, and appends it to m_lexed_tokens. +void DILLexer::UpdateLexedTokens(Token &result, Token::Kind tok_kind, + std::string tok_str, uint32_t tok_pos) { + Token new_token(tok_kind, tok_str, tok_pos); + result = new_token; + m_lexed_tokens.push_back(std::move(new_token)); +} + +// Calls Lex() repeatedly until it produces an eof token. Returns the first +// error reported by Lex(), or true once eof has been reached. +llvm::Expected<bool> DILLexer::LexAll() { + bool done = false; + while (!done) { + auto tok_or_err = Lex(); + if (!tok_or_err) + return tok_or_err.takeError(); + Token token = *tok_or_err; + if (token.GetKind() == Token::eof) { + done = true; + } + } + return true; +} + +// Lexes the next token starting at m_cur_pos: skips spaces, then recognizes +// end of input, a word (keyword or identifier), or one of the operators. +// Every recognized token is also recorded through UpdateLexedTokens(). +// Returns an error if the input at the current position cannot be lexed. +llvm::Expected<Token> DILLexer::Lex() { + Token result; + + // Skip over whitespace (spaces). + while (m_cur_pos != m_expr.end() && *m_cur_pos == ' ') + m_cur_pos++; + + // Check to see if we've reached the end of our input string. + if (m_cur_pos == m_expr.end()) { + UpdateLexedTokens(result, Token::eof, "", (uint32_t)m_expr.size()); + return result; + } + + uint32_t position = m_cur_pos - m_expr.begin(); + llvm::StringRef::iterator start = m_cur_pos; + llvm::iterator_range<llvm::StringRef::iterator> word_range = IsWord(); + if (!word_range.empty()) { + uint32_t length = word_range.end() - word_range.begin(); + llvm::StringRef word(m_expr.substr(position, length)); + // We will be adding more keywords here in the future... 
+ Token::Kind kind = llvm::StringSwitch<Token::Kind>(word) + .Case("namespace", Token::kw_namespace) + .Default(Token::identifier); + UpdateLexedTokens(result, kind, word.str(), position); + return result; + } + + m_cur_pos = start; + llvm::StringRef remainder(m_expr.substr(position, m_expr.end() - m_cur_pos)); + std::vector<std::pair<Token::Kind, const char *>> operators = { + {Token::l_paren, "("}, + {Token::r_paren, ")"}, + {Token::coloncolon, "::"}, + }; + for (auto [kind, str] : operators) { + if (remainder.consume_front(str)) { + m_cur_pos += strlen(str); + UpdateLexedTokens(result, kind, str, position); + return result; + } + } + + // Unrecognized character(s) in string; unable to lex it. + Status error("Unable to lex input string"); + return error.ToError(); +} ---------------- labath wrote:
Sorry for rewriting this for you, but I figured it's easier than explaining everything in the abstract. The main things I wanted to achieve by this are: - no half-initialized state (object constructed, but LexAll not called). The object is always constructed fully parsed. It's basically what's described [here](https://llvm.org/docs/ProgrammersManual.html#fallible-constructors), but even better because there isn't even a privately-visible half-initialized state. (Since the only state of the lexer is basically "the remainder of the string", I figured it's easier to pass it as arguments and construct the lexer only when it's done. This also lets us get rid of the `m_cur_pos` member, which is only used in the initialization stage.) - I doubled down on the StringRef representation. I see you've partially used it, but that still meant that there were some awkward conversions between position-in-the-string and StringRef representations. Now they're gone. I also realized that `iterator_range<StringRef::iterator>` is just an (unnecessarily) fancy name for `StringRef`, so I just use that throughout. - no more `UpdateLexedTokens`. Just using `Token` as a value type. The overall programming style is also more functional - fewer side effects, more return values. The thing I did not do (but I still think it would be better) is to replace the `std::vector<std::pair<>>` keyword representation with the "constexpr array of pairs" I had in my original suggestion. I think that's better because the vector thing means you'll be constructing a new vector object every time you call this function. That's going to impact the performance more (although it will still probably be unnoticeable) than any StringSwitch usage, as it causes a memory allocation. If you think the use of a C array is obsolete, you can also use a `constexpr std::initializer_list<std::pair<>>`, but I find that just adds an unnecessary level of boilerplate. 
```suggestion llvm::Expected<DILLexer> DILLexer::Create(llvm::StringRef expr) { std::vector<Token> tokens; llvm::StringRef remainder = expr; do { if (llvm::Expected<Token> t = Lex(expr, remainder)) tokens.push_back(std:move(*t); else return t.takeError(); } while (tokens.back().GetKind() != Token::eof); return DILLexer(std::move(tokens)); // calling a private constructor } static llvm::Expected<Token> Lex(llvm::StringRef expr, llvm::StringRef &remainder) { // Skip over whitespace. remainder = remainder.ltrim(); size_t position = remainder.data()-expr.data(); // Check to see if we've reached the end of our input string. if (remainder.empty()) return Token(Token::eof, if (m_cur_pos == m_expr.end()) return Token(Token::eof, "", position); llvm::StringRef word = IsWord(remainder); // automatically updates `remainder`, you may be able to use things like `StringRef::drop_while` in the implementation if (!word_range.empty()) { // We will be adding more keywords here in the future... Token::Kind kind = llvm::StringSwitch<Token::Kind>(word) .Case("namespace", Token::kw_namespace) .Default(Token::identifier); return Token(kind, word.str(), position); } std::vector<std::pair<Token::Kind, const char *>> operators = { {Token::l_paren, "("}, {Token::r_paren, ")"}, {Token::coloncolon, "::"}, }; for (auto [kind, str] : operators) { if (remainder.consume_front(str)) return Token(kind, str, position); } return llvm::createStringError("Unable to lex input string"); } ``` https://github.com/llvm/llvm-project/pull/123521 _______________________________________________ lldb-commits mailing list lldb-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/lldb-commits