// Copyright 2018 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <algorithm>
#include <set>
#include <sstream>
#include <unordered_map>
#include <unordered_set>

#include "src/torque/ast.h"
|
#include "src/torque/earley-parser.h"
|
#include "src/torque/utils.h"
|
|
namespace v8 {
namespace internal {
namespace torque {

namespace {

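// Advances the line/column bookkeeping in {pos} over all characters in the
// half-open range [from, to).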
void UpdateSourcePosition(InputPosition from, InputPosition to,
                          SourcePosition* pos) {
  while (from != to) {
    if (*from == '\n') {
      pos->line += 1;
      pos->column = 0;
    } else {
      pos->column += 1;
    }
    ++from;
  }
}

}  // namespace

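// Runs the parse actions of all child items bottom-up, then applies this
// rule's own action to the collected child results.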
base::Optional<ParseResult> Rule::RunAction(const Item* completed_item,
                                            const LexerResult& tokens) const {
  std::vector<ParseResult> results;
  for (const Item* child : completed_item->Children()) {
    // A null child stands for a matched terminal, which has no action to run.
    if (!child) continue;
    base::Optional<ParseResult> child_result =
        child->left()->RunAction(child, tokens);
    if (child_result) results.push_back(std::move(*child_result));
  }
  MatchedInput matched_input = completed_item->GetMatchedInput(tokens);
  CurrentSourcePosition::Scope pos_scope(matched_input.pos);
  ParseResultIterator iterator(std::move(results), matched_input);
  return action_(&iterator);
}

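// Assigning an initializer list of rules to a symbol replaces all of its
// existing rules.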
Symbol& Symbol::operator=(std::initializer_list<Rule> rules) {
  rules_.clear();
  for (const Rule& rule : rules) {
    AddRule(rule);
  }
  return *this;
}

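// Reconstructs the list of child items of a completed item by walking the
// {prev_} chain back to the beginning of the rule.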
std::vector<const Item*> Item::Children() const {
  std::vector<const Item*> children;
  for (const Item* current = this; current->prev_; current = current->prev_) {
    children.push_back(current->child_);
  }
  // The above loop collects the child nodes in reverse order.
  std::reverse(children.begin(), children.end());
  DCHECK_EQ(children.size(), right().size());
  return children;
}

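// Renders the input matched by this item, inserting spaces at the boundaries
// between its children. Used to format ambiguity error messages.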
std::string Item::SplitByChildren(const LexerResult& tokens) const {
  if (right().size() == 1) {
    // A rule with a single subexpression adds no structure; recurse into the
    // child instead.
    if (const Item* child = Children()[0])
      return child->SplitByChildren(tokens);
  }
  std::stringstream s;
  bool first = true;
  for (const Item* item : Children()) {
    if (!item) continue;
    if (!first) s << " ";
    s << item->GetMatchedInput(tokens).ToString();
    first = false;
  }
  return s.str();
}

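// Reports an error if {this} and {other} match the same input but were
// derived differently, which means the grammar is ambiguous.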
void Item::CheckAmbiguity(const Item& other, const LexerResult& tokens) const {
  DCHECK(*this == other);
  if (child_ != other.child_) {
    std::stringstream s;
    s << "Ambiguous grammar rules for \""
      << child_->GetMatchedInput(tokens).ToString() << "\":\n "
      << child_->SplitByChildren(tokens) << "\nvs\n "
      << other.child_->SplitByChildren(tokens);
    ReportError(s.str());
  }
  if (prev_ != other.prev_) {
    std::stringstream s;
    s << "Ambiguous grammar rules for \"" << GetMatchedInput(tokens).ToString()
      << "\":\n " << SplitByChildren(tokens) << " ...\nvs\n "
      << other.SplitByChildren(tokens) << " ...";
    ReportError(s.str());
  }
}

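// Splits {input} into tokens, recording each token's symbol, matched text,
// and source position.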
LexerResult Lexer::RunLexer(const std::string& input) {
  LexerResult result;
  InputPosition const begin = input.c_str();
  InputPosition const end = begin + input.size();
  InputPosition pos = begin;
  InputPosition token_start = pos;
  CurrentSourcePosition::Scope scope(
      SourcePosition{CurrentSourceFile::Get(), 0, 0});
  match_whitespace_(&pos);
  while (pos != end) {
    UpdateSourcePosition(token_start, pos, &CurrentSourcePosition::Get());
    token_start = pos;
    Symbol* symbol = MatchToken(&pos, end);
    if (!symbol) {
      ReportError("Lexer Error: unknown token " +
                  StringLiteralQuote(std::string(
                      token_start, token_start + std::min<ptrdiff_t>(
                                       end - token_start, 10))));
    }
    result.token_symbols.push_back(symbol);
    result.token_contents.push_back(
        {token_start, pos, CurrentSourcePosition::Get()});
    match_whitespace_(&pos);
  }
  UpdateSourcePosition(token_start, pos, &CurrentSourcePosition::Get());
  // Add an additional token position to simplify corner cases.
  result.token_contents.push_back({pos, pos, CurrentSourcePosition::Get()});
  return result;
}

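// Matches the longest token starting at {*pos}, preferring keywords over
// patterns that match the same text. Returns nullptr if nothing matches.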
Symbol* Lexer::MatchToken(InputPosition* pos, InputPosition end) {
  InputPosition token_start = *pos;
  Symbol* symbol = nullptr;
  // Find the longest matching pattern.
  for (std::pair<const PatternFunction, Symbol>& pair : patterns_) {
    InputPosition token_end = token_start;
    PatternFunction match_pattern = pair.first;
    if (match_pattern(&token_end) && token_end > *pos) {
      *pos = token_end;
      symbol = &pair.second;
    }
  }
  // Check if the matched pattern coincides with a keyword. Prefer the keyword
  // in this case.
  if (*pos != token_start) {
    auto found_keyword = keywords_.find(std::string(token_start, *pos));
    if (found_keyword != keywords_.end()) {
      return &found_keyword->second;
    }
    return symbol;
  }
  // Now check for a keyword (that doesn't overlap with a pattern).
  // Iterate from the end to ensure that if one keyword is a prefix of another,
  // we first try to match the longer one.
  for (auto it = keywords_.rbegin(); it != keywords_.rend(); ++it) {
    const std::string& keyword = it->first;
    if (static_cast<size_t>(end - *pos) < keyword.size()) continue;
    if (keyword == std::string(*pos, *pos + keyword.size())) {
      *pos += keyword.size();
      return &it->second;
    }
  }
  return nullptr;
}

// This is an implementation of Earley's parsing algorithm
// (https://en.wikipedia.org/wiki/Earley_parser).
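// Each {Item} tracks a partially matched rule: how far matching has
// progressed within the rule, where the match started, and the current input
// position. Items are processed in three phases: 'scan' advances an item over
// the next input token, 'predict' expands a non-terminal into its rules, and
// 'complete' advances items that were waiting for a finished non-terminal.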
const Item* RunEarleyAlgorithm(
    Symbol* start, const LexerResult& tokens,
    std::unordered_set<Item, base::hash<Item>>* processed) {
  // Worklist for items at the current position.
  std::vector<Item> worklist;
  // Worklist for items at the next position.
  std::vector<Item> future_items;
  CurrentSourcePosition::Scope source_position(
      SourcePosition{CurrentSourceFile::Get(), 0, 0});
  std::vector<const Item*> completed_items;
  // Maps a (start position, symbol) pair to the items waiting at that
  // position for that symbol to be completed.
  std::unordered_map<std::pair<size_t, Symbol*>, std::set<const Item*>,
                     base::hash<std::pair<size_t, Symbol*>>>
      waiting;

  std::vector<const Item*> debug_trace;

  // Start with one top_level symbol mapping to the start symbol of the
  // grammar. This simplifies things because the start symbol might have
  // several rules.
  Symbol top_level;
  top_level.AddRule(Rule({start}));
  worklist.push_back(Item{top_level.rule(0), 0, 0, 0});

  size_t input_length = tokens.token_symbols.size();

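  // {pos} ranges over the positions between tokens; the worklist holds all
  // not-yet-processed items whose match ends at {pos}.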
  for (size_t pos = 0; pos <= input_length; ++pos) {
    while (!worklist.empty()) {
      auto insert_result = processed->insert(worklist.back());
      const Item& item = *insert_result.first;
      DCHECK_EQ(pos, item.pos());
      MatchedInput last_token = tokens.token_contents[pos];
      CurrentSourcePosition::Get() = last_token.pos;
      bool is_new = insert_result.second;
      // If an equal item was already processed, check that both derivations
      // agree; otherwise the grammar is ambiguous.
      if (!is_new) item.CheckAmbiguity(worklist.back(), tokens);
      worklist.pop_back();
      if (!is_new) continue;

      debug_trace.push_back(&item);
      if (item.IsComplete()) {
        // 'Complete' phase: Advance all items that were waiting to match this
        // symbol next.
        for (const Item* parent : waiting[{item.start(), item.left()}]) {
          worklist.push_back(parent->Advance(pos, &item));
        }
      } else {
        Symbol* next = item.NextSymbol();
        // 'Scan' phase: Check if {next} is the next symbol in the input (this
        // is never the case if {next} is a non-terminal).
        if (pos < tokens.token_symbols.size() &&
            tokens.token_symbols[pos] == next) {
          future_items.push_back(item.Advance(pos + 1, nullptr));
        }
        // 'Predict' phase: Add items for every rule of the non-terminal.
        if (!next->IsTerminal()) {
          // Remember that this item is waiting for completion with {next}.
          waiting[{pos, next}].insert(&item);
        }
        for (size_t i = 0; i < next->rule_number(); ++i) {
          Rule* rule = next->rule(i);
          auto already_completed =
              processed->find(Item{rule, rule->right().size(), pos, pos});
          // As discussed in section 3 of
          //   Aycock, John, and R. Nigel Horspool. "Practical Earley
          //   parsing." The Computer Journal 45.6 (2002): 620-630.
          // Earley parsing has the following problem with epsilon rules:
          // When we complete an item that started at the current position
          // (that is, it matched zero tokens), we might not yet have
          // predicted all items it can complete with. Thus we check for the
          // existence of such items here and complete them immediately.
          if (already_completed != processed->end()) {
            worklist.push_back(item.Advance(pos, &*already_completed));
          } else {
            worklist.push_back(Item{rule, 0, pos, pos});
          }
        }
      }
    }
    // All items ending at {pos} are processed; the items scanned into
    // {future_items} become the worklist for the next position.
    std::swap(worklist, future_items);
  }

  auto final_item =
      processed->find(Item{top_level.rule(0), 1, 0, input_length});
  if (final_item != processed->end()) {
    // Success: The {top_level} rule matches the complete input.
    return final_item->Children()[0];
  }
  // Failure: Report an error based on the last item that was processed.
  std::string reason;
  const Item& last_item = *debug_trace.back();
  if (last_item.pos() < tokens.token_symbols.size()) {
    std::string next_token = tokens.token_contents[last_item.pos()].ToString();
    reason = "unexpected token \"" + next_token + "\"";
  } else {
    reason = "unexpected end of input";
  }
  ReportError("Parser Error: " + reason);
}

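// Matches a single character satisfying {char_class} and advances {*pos}
// past it.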
// static
bool Grammar::MatchChar(int (*char_class)(int), InputPosition* pos) {
  if (**pos && char_class(static_cast<unsigned char>(**pos))) {
    ++*pos;
    return true;
  }
  return false;
}

// static
bool Grammar::MatchChar(bool (*char_class)(char), InputPosition* pos) {
  if (**pos && char_class(**pos)) {
    ++*pos;
    return true;
  }
  return false;
}

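// Matches the literal string {s} at {*pos} and advances past it on success;
// on failure, {*pos} is left unchanged.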
// static
bool Grammar::MatchString(const char* s, InputPosition* pos) {
  InputPosition current = *pos;
  for (; *s != 0; ++s, ++current) {
    if (*s != *current) return false;
  }
  *pos = current;
  return true;
}

// static
bool Grammar::MatchAnyChar(InputPosition* pos) {
  return MatchChar([](char c) { return true; }, pos);
}

}  // namespace torque
}  // namespace internal
}  // namespace v8