diff --git a/.gitmodules b/.gitmodules index 44aee30a..94154e94 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13,3 +13,6 @@ path = external/throwing_ptr url = https://github.com/rockdreamer/throwing_ptr ignore = dirty +[submodule "external/lexertl17"] + path = external/lexertl17 + url = https://github.com/BenHanson/lexertl17.git diff --git a/external/lexertl17 b/external/lexertl17 new file mode 160000 index 00000000..5507eedb --- /dev/null +++ b/external/lexertl17 @@ -0,0 +1 @@ +Subproject commit 5507eedbfcc2273285603f70fdc4cfc99e57be59 diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 29cbf1e6..5a1874cf 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -11,6 +11,22 @@ else () message(STATUS "libpl static library is being created") endif () +add_subdirectory(lexer_gen) + +set(PL_STATICLEXERPATH "${CMAKE_CURRENT_SOURCE_DIR}/include/pl/core/generated/lexer_static.hpp") + +add_custom_command( + OUTPUT ${PL_STATICLEXERPATH} + COMMAND $ ${PL_STATICLEXERPATH} + DEPENDS lexer_gen + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMENT "Generating lexertl17 static lexer" +) + +add_custom_target(run_lexer_gen + DEPENDS ${PL_STATICLEXERPATH} +) + add_library(libpl ${LIBRARY_TYPE} source/pl/helpers/utils.cpp @@ -52,7 +68,11 @@ add_library(libpl ${LIBRARY_TYPE} source/pl/core/token.cpp source/pl/core/evaluator.cpp + source/pl/core/lexer.cpp + $<$>:${PL_STATICLEXERPATH}> + $<$:source/pl/core/lexer_sm.cpp> + source/pl/core/parser.cpp source/pl/core/preprocessor.cpp source/pl/core/validator.cpp @@ -102,6 +122,7 @@ endif () target_include_directories(libpl PUBLIC include ../external/throwing_ptr/include) target_include_directories(libpl_includes INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include ../external/throwing_ptr/include) +target_include_directories(libpl PRIVATE include ../external/lexertl17/include) target_link_libraries(libpl PRIVATE ${FMT_LIBRARIES}) target_link_libraries(libpl PUBLIC wolv::types wolv::io wolv::utils wolv::hash wolv::containers) diff --git a/lib/include/pl/core/.gitignore b/lib/include/pl/core/.gitignore new file mode 100644 index 00000000..da604898 --- /dev/null +++ b/lib/include/pl/core/.gitignore @@ -0,0 +1,2 @@ +# Ignore generated code folder +generated/ \ No newline at end of file diff --git a/lib/include/pl/core/errors/error.hpp b/lib/include/pl/core/errors/error.hpp index edbc5073..b1be74fa 100644 --- a/lib/include/pl/core/errors/error.hpp +++ b/lib/include/pl/core/errors/error.hpp @@ -130,33 +130,27 @@ namespace pl::core::err { std::vector m_trace; }; - class ErrorCollector { + class ErrorCollectorExplicitLocation + { public: - - virtual ~ErrorCollector() = default; - - virtual Location location() = 0; + virtual ~ErrorCollectorExplicitLocation() = default; template - void error(const fmt::format_string& fmt, Args&&... args) { - this->m_errors.emplace_back(fmt::format(fmt, std::forward(args)...), location()); + void error(const Location &location, const fmt::format_string &fmt, Args&&... args) { + this->m_errors.emplace_back(fmt::format(fmt, std::forward(args)...), location); } - void error(const std::string &message) { - this->m_errors.emplace_back(message, location()); - } - - void errorDesc(const std::string &message, const std::string &description) { - this->m_errors.emplace_back(message, description, location()); + void errorDesc(const Location &location, const std::string &message, const std::string &description) { + this->m_errors.emplace_back(message, description, location); } template - void errorDesc(const fmt::format_string& message, const std::string &description, Args&&... args) { - this->m_errors.emplace_back(fmt::format(message, std::forward(args)...), description, location()); + void errorDesc(const Location &location, const fmt::format_string& message, const std::string &description, Args&&... args) { + this->m_errors.emplace_back(fmt::format(message, std::forward(args)...), description, location); } - void error(CompileError& error) { - error.getTrace().push_back(location()); + void error(const Location &location, CompileError& error) { + error.getTrace().push_back(location); this->m_errors.push_back(std::move(error)); } @@ -188,8 +182,39 @@ namespace pl::core::err { void clear() { this->m_errors.clear(); } + private: std::vector m_errors; }; + class ErrorCollector : public ErrorCollectorExplicitLocation { + public: + + virtual ~ErrorCollector() = default; + + virtual Location location() = 0; + + template + void error(const fmt::format_string &fmt, Args&&... args) { + this->ErrorCollectorExplicitLocation::error(location(), fmt, std::forward(args)...); + } + + void error(const std::string &message) { + this->errorAt(location(), message); + } + + void errorDesc(const std::string &message, const std::string &description) { + this->ErrorCollectorExplicitLocation::errorDesc(location(), message, description); + } + + template + void errorDesc(const fmt::format_string& message, const std::string &description, Args&&... args) { + this->ErrorCollectorExplicitLocation::errorDesc(location(), message, description, std::forward(args)...); + } + + void error(CompileError& error) { + this->ErrorCollectorExplicitLocation::error(location(), error); + } + }; + } \ No newline at end of file diff --git a/lib/include/pl/core/lexer.hpp b/lib/include/pl/core/lexer.hpp index 4a9f2aa5..2028735b 100644 --- a/lib/include/pl/core/lexer.hpp +++ b/lib/include/pl/core/lexer.hpp @@ -1,72 +1,34 @@ +// "Lexer.hpp" #pragma once -#include -#include - -#include - -#include - +#include #include -#include #include - +#include +#include #include +// Debugging +#include namespace pl::core { - class Lexer : err::ErrorCollector { + class Lexer : err::ErrorCollectorExplicitLocation { public: - Lexer() = default; + Lexer(); + + void reset(); hlp::CompileResult> lex(const api::Source *source); size_t getLongestLineLength() const { return m_longestLineLength; } - void reset(); private: - [[nodiscard]] char peek(size_t p = 1) const; - bool processToken(auto parserFunction, const std::string_view& identifier); - Location location() override; - - std::optional parseCharacter(); - std::optional parseOperator(); - std::optional parseSeparator(); - std::optional parseOneLineComment(); - std::optional parseOneLineDocComment(); - std::optional parseMultiLineComment(); - std::optional parseMultiLineDocComment(); - std::optional parseKeyword(const std::string_view &identifier); - std::optional parseType(const std::string_view &identifier); - std::optional parseDirectiveName(const std::string_view &identifier); - std::optional parseNamedOperator(const std::string_view &identifier); - std::optional parseConstant(const std::string_view &identifier); - std::optional parseStringLiteral(); - std::optional parseDirectiveArgument(); - std::optional parseDirectiveValue(); - std::optional parseIntegerLiteral(std::string_view literal); + std::optional parseInteger(std::string_view literal, const auto &location); + std::optional parseFloatingPoint(std::string_view literal, const char suffix, const auto &location); + std::optional parseCharacter(const char* &pchar, const char* e, const auto &location); + std::optional parseStringLiteral(std::string_view literal, const auto &location); - std::optional parseFloatingPoint(std::string_view literal, char suffix); - std::optional parseInteger(std::string_view literal); - - Token makeToken(const Token& token, size_t length = 1); - static Token makeTokenAt(const Token& token, Location& location, size_t length = 1); - void addToken(const Token& token); - bool hasTheLineEnded(const char &ch) { - if(ch == '\n') { - m_longestLineLength = std::max(m_longestLineLength, m_cursor - m_lineBegin); - m_line++; - m_lineBegin = m_cursor; - return true; - } - return false; - } - std::string m_sourceCode; - const api::Source* m_source = nullptr; std::vector m_tokens; - size_t m_cursor = 0; - u32 m_line = 0; - u32 m_lineBegin = 0; - size_t m_longestLineLength = 0; - u32 m_errorLength = 0; + std::size_t m_longestLineLength = 0; }; -} \ No newline at end of file + +} // namespace pl::core diff --git a/lib/include/pl/core/lexer_sm.hpp b/lib/include/pl/core/lexer_sm.hpp new file mode 100644 index 00000000..2722cee4 --- /dev/null +++ b/lib/include/pl/core/lexer_sm.hpp @@ -0,0 +1,23 @@ +#pragma once + +#include + +namespace pl::core { + + namespace { + + namespace LexerToken { + + enum { + EndOfFile, NewLine, KWNamedOpTypeConstIdent, SingleLineComment, + MultiLineCommentOpen, MultiLineCommentClose, String, Separator, + Directive, DirectiveType, DirectiveParam, Operator, Char, + Integer, FPNumber + }; + + } + } + + void newLexerBuild(lexertl::state_machine &sm); + +} diff --git a/lib/lexer_gen/CMakeLists.txt b/lib/lexer_gen/CMakeLists.txt new file mode 100644 index 00000000..f94d2f9f --- /dev/null +++ b/lib/lexer_gen/CMakeLists.txt @@ -0,0 +1,12 @@ +set(SOURCES + ../source/pl/core/lexer_sm.cpp + main.cpp +) + +add_executable(lexer_gen ${SOURCES}) + +target_include_directories(lexer_gen PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../include +) + +target_include_directories(lexer_gen PRIVATE include ../../external/lexertl17/include) diff --git a/lib/lexer_gen/main.cpp b/lib/lexer_gen/main.cpp new file mode 100644 index 00000000..371ab478 --- /dev/null +++ b/lib/lexer_gen/main.cpp @@ -0,0 +1,39 @@ +/* +Build a "static" lexer in release builds. + +Static in the sense that the state machine is built in a pre-build +step to optimize application start-up time. +*/ +#include + +#include +#include + +#include +#include + +int main(int argc, char *argv[]) +{ + if (argc!=2) + return 1; + + try + { + std::filesystem::path argPath(argv[1]); + std::filesystem::path genDir = argPath.parent_path(); + std::filesystem::create_directory(genDir); + } + catch (const std::filesystem::filesystem_error &fse) + { + return 1; + } + + lexertl::state_machine sm; + pl::core::newLexerBuild(sm); + sm.minimise(); + + std::ofstream ofs(argv[1]); + lexertl::table_based_cpp::generate("lookup", sm, false, ofs); + + return 0; +} diff --git a/lib/source/pl/core/lexer.cpp b/lib/source/pl/core/lexer.cpp index 3f2d3214..86e2cdab 100644 --- a/lib/source/pl/core/lexer.cpp +++ b/lib/source/pl/core/lexer.cpp @@ -1,35 +1,43 @@ +// new_lexer.cpp +// + +#include #include -#include -#include + #include +#include +#include -#include +#include +#include +#include + +#if defined(NDEBUG) + #include +#endif + +#include +#include +#include +#include +#include +#include +// DEBUGGING +#include +#include +// namespace pl::core { - using namespace tkn; - - static constexpr char integerSeparator = '\''; - static bool isIdentifierCharacter(const char c) { - return std::isalnum(c) || c == '_'; - } - - static bool isIntegerCharacter(const char c, const int base) { - switch (base) { - case 16: - return std::isxdigit(c); - case 10: - return std::isdigit(c); - case 8: - return c >= '0' && c <= '7'; - case 2: - return c == '0' || c == '1'; - default: - return false; - } - } + // TODO: + // "integerSeparator_" is defined in "lexer.cpp". + // Remove trailing underscore when it's removed. + static constexpr char integerSeparator_ = '\''; - static int characterValue(const char c) { + // TODO: + // "characterValue" is defined in "lexer.cpp". + // Remove trailing underscore when it's removed. + static int characterValue_(const char c) { if (c >= '0' && c <= '9') { return c - '0'; } @@ -43,163 +51,34 @@ namespace pl::core { return 0; } - static size_t getIntegerLiteralLength(const std::string_view& literal) { - const auto count = literal.find_first_not_of("0123456789ABCDEFabcdef'xXoOpP.uU+-"); - const std::string_view intLiteral = count == std::string_view::npos ? literal : literal.substr(0, count); - if (const auto signPos = intLiteral.find_first_of("+-"); signPos != std::string_view::npos && ((literal.at(signPos-1) != 'e' && literal.at(signPos-1) != 'E') || literal.starts_with("0x"))) - return signPos; - return intLiteral.size(); - } - - - std::optional Lexer::parseCharacter() { - const char& c = m_sourceCode[m_cursor++]; - if (c == '\\') { - switch (m_sourceCode[m_cursor++]) { - case 'a': - return '\a'; - case 'b': - return '\b'; - case 'f': - return '\f'; - case 'n': - return '\n'; - case 't': - return '\t'; - case 'r': - return '\r'; - case '0': - return '\0'; - case '\'': - return '\''; - case '"': - return '"'; - case '\\': - return '\\'; - case 'x': { - const char hex[3] = { m_sourceCode[m_cursor], m_sourceCode[m_cursor + 1], 0 }; - m_cursor += 2; - try { - return static_cast(std::stoul(hex, nullptr, 16)); - } catch (const std::invalid_argument&) { - m_errorLength = 2; - error("Invalid hex escape sequence: {}", hex); - return std::nullopt; - } - } - case 'u': { - const char hex[5] = { m_sourceCode[m_cursor], m_sourceCode[m_cursor + 1], m_sourceCode[m_cursor + 2], - m_sourceCode[m_cursor + 3], 0 }; - m_cursor += 4; - try { - return static_cast(std::stoul(hex, nullptr, 16)); - } catch (const std::invalid_argument&) { - m_errorLength = 4; - error("Invalid unicode escape sequence: {}", hex); - return std::nullopt; - } - } - default: - m_errorLength = 1; - error("Unknown escape sequence: {}", m_sourceCode[m_cursor-1]); - return std::nullopt; - } - } - return c; - } - - std::optional Lexer::parseDirectiveName(const std::string_view &identifier) { - const auto &directives = Token::Directives(); - if (const auto directiveToken = directives.find(identifier); directiveToken != directives.end()) { - return makeToken(directiveToken->second, identifier.length()); - } - m_errorLength = identifier.length(); - error("Unknown directive: {}", identifier); - return std::nullopt; - } - - std::optional Lexer::parseDirectiveValue() { - std::string result; - - m_cursor++; // Skip space - auto location = this->location(); - - while (!std::isblank(m_sourceCode[m_cursor]) && !std::isspace(m_sourceCode[m_cursor]) && m_sourceCode[m_cursor] != '\0' ) { - - auto character = parseCharacter(); - if (!character.has_value()) { - return std::nullopt; - } - - result += character.value(); - } - - if (hasTheLineEnded(m_sourceCode[m_cursor])) - m_cursor++; - - return makeTokenAt(Literal::makeString(result), location, result.size()); - } - - std::optional Lexer::parseDirectiveArgument() { - std::string result; - - m_cursor++; // Skip space - auto location = this->location(); - - while (m_sourceCode[m_cursor] != '\n' && m_sourceCode[m_cursor] != '\r' && m_sourceCode[m_cursor] != '\0') { - - auto character = parseCharacter(); - if (!character.has_value()) { - return std::nullopt; - } - - result += character.value(); + // TODO: + // "isIntegerCharacter" is defined in "lexer.cpp". + // Remove trailing underscore when it's removed. + static bool isIntegerCharacter_(const char c, const int base) { + switch (base) { + case 16: + return std::isxdigit(c); + case 10: + return std::isdigit(c); + case 8: + return c >= '0' && c <= '7'; + case 2: + return c == '0' || c == '1'; + default: + return false; } - - if (hasTheLineEnded(m_sourceCode[m_cursor])) - m_cursor++; - - return makeTokenAt(Literal::makeString(result), location, result.size()); } - std::optional Lexer::parseStringLiteral() { - std::string result; - auto location = this->location(); - - m_cursor++; // Skip opening " - - while (m_sourceCode[m_cursor] != '\"') { - char c = peek(0); - if (c == '\n' || c == '\r') { - m_errorLength = 1; - error("Unexpected newline in string literal"); - m_line++; - m_lineBegin = m_cursor; - return std::nullopt; - } - - if (c == '\0') { - m_errorLength = 1; - error("Unexpected end of file in string literal"); - return std::nullopt; - } - - auto character = parseCharacter(); - if (!character.has_value()) { - return std::nullopt; - } - - result += character.value(); + // TODO: + // Consider making 'location' not use templates. Here and in other functions below. + std::optional Lexer::parseInteger(std::string_view literal, const auto &location) { + const bool isUnsigned = hlp::stringEndsWithOneOf(literal, { "u", "U" }); + if(isUnsigned) { + // remove suffix + literal = literal.substr(0, literal.size() - 1); } - m_cursor++; - - return makeTokenAt(Literal::makeString(result), location, result.size() + 2); - } - - std::optional Lexer::parseInteger(std::string_view literal) { u8 base = 10; - u128 value = 0; if(literal[0] == '0') { if(literal.size() == 1) { @@ -229,26 +108,28 @@ namespace pl::core { } for (const char c : literal) { - if(c == integerSeparator) continue; + if(c == integerSeparator_) continue; - if (!isIntegerCharacter(c, base)) { - m_errorLength = literal.size(); - error("Invalid integer literal: {}", literal); + if (!isIntegerCharacter_(c, base)) { + error(location(), "Invalid integer literal: {}", literal); return std::nullopt; } - value = value * base + characterValue(c); + value = value * base + characterValue_(c); } - return value; + if (isUnsigned) { + return value; + } + + return i128(value); } - std::optional Lexer::parseFloatingPoint(std::string_view literal, const char suffix) { + std::optional Lexer::parseFloatingPoint(std::string_view literal, const char suffix, const auto &location) { char *end = nullptr; double val = std::strtod(literal.data(), &end); if(end != literal.data() + literal.size()) { - m_errorLength = literal.size(); - error("Invalid float literal: {}", literal); + error(location(), "Invalid float literal: {}", literal); return std::nullopt; } @@ -263,425 +144,409 @@ namespace pl::core { } } - std::optional Lexer::parseIntegerLiteral(std::string_view literal) { - // parse a c like numeric literal - const bool floatSuffix = hlp::stringEndsWithOneOf(literal, { "f", "F", "d", "D" }); - const bool unsignedSuffix = hlp::stringEndsWithOneOf(literal, { "u", "U" }); - const bool isFloat = literal.find('.') != std::string_view::npos - || (!literal.starts_with("0x") && floatSuffix); - const bool isUnsigned = unsignedSuffix; - - if(isFloat) { - - char suffix = 0; - if(floatSuffix) { - // remove suffix - suffix = literal.back(); - literal = literal.substr(0, literal.size() - 1); + std::optional Lexer::parseCharacter(const char* &pchar, const char* e, const auto &location) { + const char c = *(pchar++); + if (c == '\\') { + switch (*(pchar++)) { + case 'a': + return '\a'; + case 'b': + return '\b'; + case 'f': + return '\f'; + case 'n': + return '\n'; + case 't': + return '\t'; + case 'r': + return '\r'; + case '0': + return '\0'; + case '\'': + return '\''; + case '"': + return '"'; + case '\\': + return '\\'; + case 'x': { + if (pchar+1 >= e) { + error(location(), "Incomplete escape sequence"); + return std::nullopt; + } + const char hex[3] = { *pchar, *(pchar+1), 0 }; // TODO: buffer overrun? + pchar += sizeof(hex)-1; + try { + return static_cast(std::stoul(hex, nullptr, 16)); + } catch (const std::invalid_argument&) { + error(location(), "Invalid hex escape sequence: {}", hex); + return std::nullopt; + } + } + case 'u': { + if (pchar+3 >= e) { + error(location(), "Incomplete escape sequence"); + return std::nullopt; + } + const char hex[5] = { *pchar, *(pchar+1), *(pchar+2), *(pchar+3), 0}; + pchar += sizeof(hex)-1; + try { + return static_cast(std::stoul(hex, nullptr, 16)); + } catch (const std::invalid_argument&) { + error(location(), "Invalid unicode escape sequence: {}", hex); + return std::nullopt; + } + } + default: + error(location(), "Unknown escape sequence: {}", pchar); + return std::nullopt; } - - auto floatingPoint = parseFloatingPoint(literal, suffix); - - if(!floatingPoint.has_value()) return std::nullopt; - - return floatingPoint.value(); - - } - - if(unsignedSuffix) { - // remove suffix - literal = literal.substr(0, literal.size() - 1); } - - const auto integer = parseInteger(literal); - - if(!integer.has_value()) return std::nullopt; - - u128 value = integer.value(); - if(isUnsigned) { - return value; - } - - return i128(value); + return c; } - std::optional Lexer::parseOneLineComment() { - auto location = this->location(); - const auto begin = m_cursor; - m_cursor += 2; - + std::optional Lexer::parseStringLiteral(std::string_view literal, const auto &location) + { std::string result; - while(m_sourceCode[m_cursor] != '\n' && m_sourceCode[m_cursor] != '\r' && m_sourceCode[m_cursor] != '\0') { - result += m_sourceCode[m_cursor]; - m_cursor++; - } - auto len = m_cursor - begin; - - if (hasTheLineEnded(m_sourceCode[m_cursor])) - m_cursor++; - - return makeTokenAt(Literal::makeComment(true, result), location, len); - } - std::optional Lexer::parseOneLineDocComment() { - auto location = this->location(); - const auto begin = m_cursor; - m_cursor += 3; - - std::string result; - while(m_sourceCode[m_cursor] != '\n' && m_sourceCode[m_cursor] != '\r' && m_sourceCode[m_cursor] != '\0') { - result += m_sourceCode[m_cursor]; - m_cursor++; + if (literal.size()==0) { + return Token{Token::Type::String, Token::Literal(result), location()}; } - auto len = m_cursor - begin; - - if (hasTheLineEnded(m_sourceCode[m_cursor])) - m_cursor++; - - return makeTokenAt(Literal::makeDocComment(false, true, result), location, len); - } - std::optional Lexer::parseMultiLineDocComment() { - auto location = this->location(); - const auto begin = m_cursor; - const bool global = peek(2) == '!'; - std::string result; - - m_cursor += 3; - while(true) { - hasTheLineEnded(peek(0)); - - if(peek(1) == '\x00') { - m_errorLength = 1; - error("Unexpected end of file while parsing multi line doc comment"); + const char *p = &literal.front(); + const char *e = &literal.back(); // inclusive + while (p<=e) { + auto character = parseCharacter(p, e+1, location); + if (!character.has_value()) { return std::nullopt; } - if(peek(0) == '*' && peek(1) == '/') { - m_cursor += 2; - break; - } - - result += m_sourceCode[m_cursor++]; + result += character.value(); } - return makeTokenAt(Literal::makeDocComment(global, false, result), location, m_cursor - begin); + return Token{Token::Type::String, Token::Literal(result), location()}; } - std::optional Lexer::parseMultiLineComment() { - auto location = this->location(); - const auto begin = m_cursor; - std::string result; + namespace { + bool g_lexerStaticInitDone = false; - m_cursor += 2; - while(true) { - hasTheLineEnded(peek(0)); + // Much of the contents of this anonymous namespace serve as conceptually + // private static members of the Lexer class. They're placed here to avoid + // pulling in unnecessary symbols into every file that includes our header. - if(peek(1) == '\x00') { - m_errorLength = 2; - error("Unexpected end of file while parsing multi line doc comment"); - return std::nullopt; - } + struct KWOpTypeInfo { + Token::Type type; + Token::ValueTypes value; + }; - if(peek(0) == '*' && peek(1) == '/') { - m_cursor += 2; - break; + // This "Trans" stuff is to allow us to use std::string_view to lookup stuff + // so we don't have to construct a std::string. + struct TransHash { + using is_transparent = void; + + std::size_t operator()(const std::string &s) const noexcept { + return std::hash{}(s); } - result += m_sourceCode[m_cursor++]; - } + std::size_t operator()(std::string_view s) const noexcept { + return std::hash{}(s); + } + }; - return makeTokenAt(Literal::makeComment(false, result), location, m_cursor - begin); - } + struct TransEqual { + using is_transparent = void; - std::optional Lexer::parseOperator() { - auto location = this->location(); - const auto begin = m_cursor; - const auto operators = Token::Operators(); - std::optional lastMatch = std::nullopt; - - for (int i = 1; i <= Operator::maxOperatorLength; ++i) { - const auto view = std::string_view { &m_sourceCode[begin], static_cast(i) }; - if (auto operatorToken = operators.find(view); operatorToken != operators.end()) { - m_cursor++; - lastMatch = operatorToken->second; + bool operator()(const std::string &lhs, const std::string &rhs) const noexcept { + return lhs == rhs; } - } - return lastMatch ? makeTokenAt(lastMatch.value(), location, m_cursor - begin) : lastMatch; - } + bool operator()(const std::string &lhs, std::string_view rhs) const noexcept { + return lhs == rhs; + } - std::optional Lexer::parseSeparator() { - auto location = this->location(); - const auto begin = m_cursor; + bool operator()(std::string_view lhs, const std::string& rhs) const noexcept { + return lhs == rhs; + } - if (const auto separatorToken = Token::Separators().find(m_sourceCode[m_cursor]); - separatorToken != Token::Separators().end()) { - m_cursor++; - return makeTokenAt(separatorToken->second, location, m_cursor - begin); + bool operator()(std::string_view lhs, std::string_view rhs) const noexcept { + return lhs == rhs; } + }; - return std::nullopt; - } + std::unordered_map g_KWOpTypeTokenInfo; - std::optional Lexer::parseKeyword(const std::string_view &identifier) { - const auto keywords = Token::Keywords(); - if (const auto keywordToken = keywords.find(identifier); keywordToken != keywords.end()) { - return makeToken(keywordToken->second, identifier.length()); - } - return std::nullopt; - } +#if !defined(NDEBUG) + lexertl::state_machine g_sm; +#endif - std::optional Lexer::parseType(const std::string_view &identifier) { - auto types = Token::Types(); - if (const auto typeToken = types.find(identifier); typeToken != types.end()) { - return makeToken(typeToken->second, identifier.length()); - } - return std::nullopt; - } + } // anonymous namespace - std::optional Lexer::parseNamedOperator(const std::string_view &identifier) { - auto operators = Token::Operators(); - if (const auto operatorToken = operators.find(identifier); operatorToken != operators.end()) { - return makeToken(operatorToken->second, identifier.length()); + void initNewLexer() + { + const auto &keywords = Token::Keywords(); + for (const auto& [key, value] : keywords) + g_KWOpTypeTokenInfo.insert(std::make_pair(key, KWOpTypeInfo{value.type, value.value})); + + const auto &operators = Token::Operators(); + for (const auto& [key, value] : operators) { + g_KWOpTypeTokenInfo.insert(std::make_pair(key, KWOpTypeInfo{value.type, value.value})); } - return std::nullopt; - } - std::optional Lexer::parseConstant(const std::string_view &identifier) { - if (const auto constantToken = constants.find(identifier); constantToken != constants.end()) { - return makeToken(Literal::makeNumeric(constantToken->second), identifier.length()); - } - return std::nullopt; - } + const auto &types = Token::Types(); + for (const auto& [key, value] : types) + g_KWOpTypeTokenInfo.insert(std::make_pair(key, KWOpTypeInfo{value.type, value.value})); - Token Lexer::makeToken(const Token &token, const size_t length) { - auto location = this->location(); - location.length = length; - return { token.type, token.value, location }; +#if !defined(NDEBUG) + newLexerBuild(g_sm); +#endif } - Token Lexer::makeTokenAt(const Token &token, Location& location, const size_t length) { - location.length = length; - return { token.type, token.value, location }; + Lexer::Lexer() { + if (!g_lexerStaticInitDone) { + g_lexerStaticInitDone = true; + initNewLexer(); + } + m_longestLineLength = 0; } - void Lexer::addToken(const Token &token) { - m_tokens.emplace_back(token); + void Lexer::reset() { + m_longestLineLength = 0; + this->m_tokens.clear(); } - hlp::CompileResult> Lexer::lex(const api::Source *source) { - this->m_sourceCode = source->content; - this->m_source = source; - - this->reset(); - - const size_t end = this->m_sourceCode.size(); - - while (this->m_cursor < end) { - const char& c = this->m_sourceCode[this->m_cursor]; - - if (c == '\x00') { - m_longestLineLength = std::max(m_longestLineLength, m_cursor - m_lineBegin); - break; // end of string - } - - if (std::isblank(c) || std::isspace(c)) { - hasTheLineEnded(c); - m_cursor++; - continue; - } + hlp::CompileResult> Lexer::lex(const api::Source *source) + { + reset(); + + std::string::const_iterator contentEnd = source->content.end(); + lexertl::smatch results(source->content.begin(), contentEnd); + + auto lineStart = results.first; + u32 line = 1; + + auto location = [&]() -> Location { + u32 column = results.first-lineStart+1; + size_t errorLength = results.second-results.first; + return Location { source, line, column, errorLength }; + }; + + std::string::const_iterator mlcomentStartRaw; // start of parsed token, no skipping + std::string::const_iterator mlcomentStart; + Location mlcommentLocation; + + enum MLCommentType{ + MLComment, + MLLocalDocComment, + MLGlobalDocComment + }; + + MLCommentType mlcommentType = MLComment; +#if defined(NDEBUG) + lookup(results); +#else + lexertl::lookup(g_sm, results); +#endif + for (;;) + { + if (results.id==LexerToken::EndOfFile) + break; - if(isIdentifierCharacter(c) && !std::isdigit(c)) { - size_t length = 0; - while (isIdentifierCharacter(peek(length))) { - length++; + switch (results.id) { + case (lexertl::smatch::id_type)-1: { + error(location(), "Unexpected character: {}", *results.first); } + break; + case LexerToken::NewLine: { + ++line; + std::size_t len = results.first - lineStart; + m_longestLineLength = std::max(len, m_longestLineLength); + lineStart = results.second; + } + break; + case LexerToken::KWNamedOpTypeConstIdent: + case LexerToken::Operator: { + const std::string_view kw(results.first, results.second); - auto identifier = std::string_view { &m_sourceCode[m_cursor], length }; - - // process keywords, named operators and types - if (processToken(&Lexer::parseKeyword, identifier) || - processToken(&Lexer::parseNamedOperator, identifier) || - processToken(&Lexer::parseType, identifier) || - processToken(&Lexer::parseConstant, identifier)) { - continue; + if (const auto it = g_KWOpTypeTokenInfo.find(kw); it != g_KWOpTypeTokenInfo.end()) { + m_tokens.emplace_back(it->second.type, it->second.value, location()); } + else if (const auto it = tkn::constants.find(kw); it != tkn::constants.end()) { + auto ctok = tkn::Literal::makeNumeric(it->second); + m_tokens.emplace_back(ctok.type, ctok.value, location()); + } + else { + auto idtok = tkn::Literal::makeIdentifier(std::string(kw)); + // TODO: + // It seems the presence of a non-zero length in the location info is being + // used by the pattern editor editor for error highlighting. This makes things + // hard. I am trying to include location info in every token. At the very least + // this could make debugging easier. Who knows, it may have other uses. In the + // mean time hack the length to 0. + auto loc = location(); + loc.length = 0; + m_tokens.emplace_back(idtok.type, idtok.value, loc); + } + } + break; + case LexerToken::SingleLineComment: { + const std::string_view comment(results.first+2, results.second); + if (comment.size() && comment[0]=='/') { + auto ctok = tkn::Literal::makeDocComment(false, true, std::string(comment.substr(1))); + m_tokens.emplace_back(ctok.type, ctok.value, location()); + } + else { + auto ctok = tkn::Literal::makeComment(true, std::string(comment)); + m_tokens.emplace_back(ctok.type, ctok.value, location()); + } + } + break; - // not a predefined token, so it must be an identifier - addToken(makeToken(Literal::makeIdentifier(std::string(identifier)), length)); - this->m_cursor += length; - - continue; - } - - if(std::isdigit(c)) { - auto literal = &m_sourceCode[m_cursor]; - size_t size = getIntegerLiteralLength(literal); - - const auto integer = parseIntegerLiteral({ literal, size }); - - if(integer.has_value()) { - addToken(makeToken(Literal::makeNumeric(integer.value()), size)); - this->m_cursor += size; - continue; + case LexerToken::MultiLineCommentOpen: { + mlcommentType = MLComment; + mlcomentStartRaw = results.first; + mlcomentStart = results.first+2; + mlcommentLocation = location(); + + const std::string_view comment(results.first+2, results.second); + if (comment.size()) { + if (comment[0]=='*') { + mlcommentType = MLLocalDocComment; + ++mlcomentStart; + } + else if (comment[0]=='!') { + mlcommentType = MLGlobalDocComment; + ++mlcomentStart; + } + } } + break; - this->m_cursor += size; - continue; - } + case LexerToken::MultiLineCommentClose: { + mlcommentLocation.length = results.second-mlcomentStartRaw; + const std::string_view comment(mlcomentStart, results.second-2); - // comment cases - if(c == '/') { - const char category = peek(1); - char type = peek(2); - if(category == '/') { - if(type == '/') { - const auto token = parseOneLineDocComment(); - if(token.has_value()) { - addToken(token.value()); + switch (mlcommentType) { + case MLComment: { + auto ctok = tkn::Literal::makeComment(false, std::string(comment)); + m_tokens.emplace_back(ctok.type, ctok.value, mlcommentLocation); + } + break; + case MLLocalDocComment: { + auto ctok = tkn::Literal::makeDocComment(false, false, std::string(comment)); + m_tokens.emplace_back(ctok.type, ctok.value, mlcommentLocation); } - } else { - const auto token = parseOneLineComment(); - if(token.has_value()) { - addToken(token.value()); + break; + case MLGlobalDocComment: { + auto ctok = tkn::Literal::makeDocComment(true, false, std::string(comment)); + m_tokens.emplace_back(ctok.type, ctok.value, mlcommentLocation); } + break; } - continue; } - if(category == '*') { - if (type != '!' && (type != '*' || peek(3) == '/' )) { - const auto token = parseMultiLineComment(); - if(token.has_value()) - addToken(token.value()); - continue; - } - const auto token = parseMultiLineDocComment(); - if(token.has_value()) { - addToken(token.value()); + break; + + case LexerToken::Integer: { + const std::string_view numStr(results.first, results.second); + auto optNum = parseInteger(numStr, location); + if (!optNum.has_value()) { + break; } - continue; + auto ntok = tkn::Literal::makeNumeric(optNum.value()); + m_tokens.emplace_back(ntok.type, ntok.value, location()); } - } + break; - const auto operatorToken = parseOperator(); - if (operatorToken.has_value()) { - addToken(operatorToken.value()); - continue; - } + case LexerToken::FPNumber: { + std::string_view numStr(results.first, results.second); + const bool floatSuffix = hlp::stringEndsWithOneOf(numStr, {"f","F","d","D"}); + char suffix = 0; + if (floatSuffix) { + // remove suffix + suffix = numStr.back(); + numStr = numStr.substr(0, numStr.size()-1); + } + auto num = parseFloatingPoint(numStr, suffix, location); + if (num.has_value()) { + auto ntok = tkn::Literal::makeNumeric(num.value()); + m_tokens.emplace_back(ntok.type, ntok.value, location()); + } - const auto separatorToken = parseSeparator(); - if (separatorToken.has_value()) { - addToken(separatorToken.value()); - continue; - } + } + break; - if (c == '#' && (m_tokens.empty() || m_tokens.back().location.line < m_line)) { - size_t length = 1; - u32 line = m_line; - while (isIdentifierCharacter(peek(length))) - length++; - auto directiveName = std::string_view{&m_sourceCode[m_cursor], length}; - - if (processToken(&Lexer::parseDirectiveName, directiveName)) { - Token::Directive directive = get(m_tokens.back().value); - if (m_line != line || directive == Token::Directive::Define || directive == Token::Directive::Undef || - peek(0) == 0 || directive == Token::Directive::IfDef || directive == Token::Directive::IfNDef || - directive == Token::Directive::EndIf) - continue; - if (hasTheLineEnded(peek(0))) { - m_cursor++; - continue; + case LexerToken::String: { + const std::string_view str(results.first+1, results.second-1); + auto optTok = parseStringLiteral(str, location); + if (optTok.has_value()) { + auto stok = optTok.value(); + m_tokens.emplace_back(stok.type, stok.value, location()); } - auto directiveValue = parseDirectiveValue(); - if (directiveValue.has_value()) { - addToken(directiveValue.value()); - if (m_line != line || peek(0) == 0) - continue; - if (hasTheLineEnded(peek(0))) { - m_cursor++; - continue; + } + break; + case LexerToken::Char: { + const std::string_view ch(results.first+1, results.second-1); + const char *p = &ch[0]; + auto optCh = parseCharacter(p, (&ch.back())+1, location); + if (optCh.has_value()) { + if (p-1 < &ch.back()) { + error(location(), "char literal too long"); + } + else if (p-1 > &ch.back()) { + error(location(), "char literal too short"); } - directiveValue = parseDirectiveArgument(); - if (directiveValue.has_value()) { - addToken(directiveValue.value()); + else { + m_tokens.emplace_back(core::Token::Type::Integer, optCh.value(), location()); } } - continue; } - } - - // literals - if (c == '"') { - const auto string = parseStringLiteral(); - - if (string.has_value()) { - addToken(string.value()); - continue; + break; + case LexerToken::Separator: { + const char sep = *results.first; + const auto separatorToken = Token::Separators().find(sep)->second; + m_tokens.emplace_back(separatorToken.type, separatorToken.value, location()); } - } else if(c == '\'') { - auto location = this->location(); - const auto begin = m_cursor; - m_cursor++; // skip opening ' - const auto character = parseCharacter(); - - if (character.has_value()) { - if(m_sourceCode[m_cursor] != '\'') { - m_errorLength = 1; - error("Expected closing '"); - continue; + break; + case LexerToken::Directive: { + auto first = results.first; + for (++first; std::isspace(static_cast(*first)); ++first) {} + std::string name = "#"+std::string(first, results.second); // TODO: I don't like this! + const auto &directives = Token::Directives(); + if (const auto directiveToken = directives.find(name); directiveToken != directives.end()) { + m_tokens.emplace_back(directiveToken->second.type, directiveToken->second.value, location()); + } + else { + error(location(), "Unknown directive: {}", name); } - - m_cursor++; // skip closing ' - - addToken(makeTokenAt(Literal::makeNumeric(character.value()), location, m_cursor - begin)); - continue; } - } else { - m_errorLength = 1; - error("Unexpected character: {}", c); - m_cursor++; - + break; + case LexerToken::DirectiveType: { + const std::string_view type(results.first, results.second); + const auto stok = tkn::Literal::makeString(std::string(type)); + m_tokens.emplace_back(stok.type, stok.value, location()); + } + break; + case LexerToken::DirectiveParam: { + const std::string_view param(results.first, results.second); + const auto stok = tkn::Literal::makeString(std::string(param)); + m_tokens.emplace_back(stok.type, stok.value, location()); + } break; } - m_cursor++; +#if defined(NDEBUG) + lookup(results); +#else + lexertl::lookup(g_sm, results); +#endif } - m_longestLineLength = std::max(m_longestLineLength, m_cursor - m_lineBegin); - addToken(makeToken(Separator::EndOfProgram, 0)); - - return { m_tokens, collectErrors() }; - } - void Lexer::reset() { - this->m_cursor = 0; - this->m_line = 1; - this->m_lineBegin = 0; - this->m_longestLineLength = 0; - this->m_tokens.clear(); - } + std::size_t len = results.first - lineStart; + m_longestLineLength = std::max(len, m_longestLineLength); + lineStart = results.second; + const auto &eop = tkn::Separator::EndOfProgram; + m_tokens.emplace_back(eop.type, eop.value, location()); - inline char Lexer::peek(const size_t p) const { - return m_cursor + p < m_sourceCode.size() ? m_sourceCode[m_cursor + p] : '\0'; - } - - bool Lexer::processToken(auto parserFunction, const std::string_view& identifier) { - const auto token = (this->*parserFunction)(identifier); - if (token.has_value()) { - m_tokens.emplace_back(token.value()); - m_cursor += identifier.size(); - return true; - } - return false; + return { m_tokens, collectErrors() }; } - Location Lexer::location() { - u32 column = m_cursor - m_lineBegin; - // There is no newline before the first line so add 1 to the column - if(m_line==1) { - column += 1; - } - return Location { m_source, m_line, column, m_errorLength }; - } -} \ No newline at end of file +} // namespace pl::core diff --git a/lib/source/pl/core/lexer_sm.cpp b/lib/source/pl/core/lexer_sm.cpp new file mode 100644 index 00000000..720b09a7 --- /dev/null +++ b/lib/source/pl/core/lexer_sm.cpp @@ -0,0 +1,170 @@ +/* + * Build the lexer state machine. + * + * In debug builds we use the state machine directly. + * + * In Release builds it's used by the pre-build step to generate the "static" + * lexer -- a precompiled implementation without any initialisation overhead. + * + * The lexertl17 lexing library is used. Its GitHub repo can be found here: + * https://github.com/BenHanson/lexertl17 + * + * This file and new_lexer.cpp are the only two files to include lexertl + * (in addition to the lexer_gen project which builds the static lexer). + */ +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace pl::core { + + namespace { + // Is c a special regex char? + // Return true if is is so we can escape it. + // Note that lexertl uses flex style regular expressions. + inline bool mustEscape(char c) + { + switch (c) { + case '+': case '-': case '/': case '*': case '?': + case '|': + case '(': case ')': + case '[': case ']': + case '{': case '}': + case '.': + case '^': case '$': + case '\\': + case '"': + return true; + + default: + break; + } + return false; + } + + // Escape special regex chars. + template + inline std::string escapeRegex(const String& s) { + std::string result; + result.reserve(s.size() * 2); + + for (char c : s) { + if (mustEscape(c)) + result += '\\'; + result += c; + } + + return result; + } + + inline std::string escapeRegex(const char *s) { + return escapeRegex(std::string(s)); + } + + } // anonymous namespace + + void newLexerBuild(lexertl::state_machine &sm) + { + try { + lexertl::rules rules; + + // lexertl uses flex style regular expressions. Some pitfalls to look out for: + // - " (quote) is a special character. If you want a literal quote, you need + // to escape it. Anything enclosed in unescaped quotes is interpreted literally + // and not as a regular expression. + // + // - [^xyz]: this will match anything that's not x, y or z. INCLUDING newlines!!! + + rules.insert_macro("NL", R"(\r\n|\n|\r)"); // Newline + rules.insert_macro("HWS", R"([ \t])"); // Horizontal whitespace + + rules.push_state("MLCOMMENT"); // we're lexing a multiline comment + rules.push_state("DIRECTIVETYPE"); + rules.push_state("DIRECTIVEPARAM"); + + // We count newlines to tell what line of the file we're on. + // Care must be taken not to eat newlines in other rules. + // There are other ways to handle this. Boost uses a special + // counting iterator. This is the simplest and should be fast. + rules.push("{NL}", LexerToken::NewLine); + + rules.push("*", "{HWS}+", lexertl::rules::skip(), "."); + + rules.push(R"(\/\/[^\r\n]*)", LexerToken::SingleLineComment); + + // We match multiline comments in two stages. First we match the comment + // opening, and then the comment closing. The full text of the comment is + // composed in the C++ code based on 'first' of the opening and 'second' + // of the closing token. + rules.push("INITIAL", R"(\/\*[^\r\n]?)", LexerToken::MultiLineCommentOpen, "MLCOMMENT"); + rules.push("MLCOMMENT", "{NL}", LexerToken::NewLine, "."); + rules.push("MLCOMMENT", R"([^*\r\n]+|.)", lexertl::rules::skip(), "MLCOMMENT"); + rules.push("MLCOMMENT", R"(\*\/)", LexerToken::MultiLineCommentClose, "INITIAL"); + + rules.push(R"([a-zA-Z_]\w*)", LexerToken::KWNamedOpTypeConstIdent); + + rules.push( + "(" + "([0-9]+\\.[0-9]*|\\.[0-9]+)" // group decimal alternatives here with '|' + "([eE][+-]?[0-9]+)?" // optional exponent + "[fFdD]?" // optional suffix + ")|" + "(" + "[0-9]+[eE][+-]?[0-9]+" // no decimal but exponent required + "[fFdD]?" // optional suffix + ")|" + "(" + "[0-9]+" // no decimal, no exponent + "[fFdD]" // suffix required + ")", + LexerToken::FPNumber + ); + + rules.push("(0[xXoObB])?[0-9a-fA-F]+('[0-9a-fA-F]+)*[uU]?", LexerToken::Integer); + + rules.push(R"(\"([^\"\r\n\\]|\\.)*\")", LexerToken::String); + rules.push(R"('('|(\\'|[^'\r\n])+)')", LexerToken::Char); + + rules.push("INITIAL", R"(#{HWS}*(define|undef|ifdef|ifndef|endif))", LexerToken::Directive, "."); + rules.push("INITIAL", R"(#{HWS}*[a-zA-Z_]\w*)", LexerToken::Directive, "DIRECTIVETYPE"); + rules.push("DIRECTIVETYPE", "{NL}", LexerToken::NewLine, "INITIAL"); + rules.push("DIRECTIVETYPE", R"(\S+)", LexerToken::DirectiveType, "DIRECTIVEPARAM"); + rules.push("DIRECTIVEPARAM", "{NL}", LexerToken::NewLine, "INITIAL"); + rules.push("DIRECTIVEPARAM", R"(\S.*)", LexerToken::DirectiveParam, "INITIAL"); + + // The parser expects >= and <= as two separate tokens. Not sure why. + // I originally intended to handle this differently but this (and other "split tokens") + // make the longest-match rule useless. I will address this when I build a new parser. + const char* ops[] = {"+", "-", "*", "/", "%", "&", "|", "^", "~", "==", "!=", "<", ">", + "&&", "||", "!", "^^", "$", ":", "::", "?", "@", "=", "addressof", + "sizeof", "typenameof"}; + std::ostringstream opsSS; + for (auto op : ops) { + opsSS << escapeRegex(op) << "|"; + } + std::string oprs = opsSS.str(); + oprs.pop_back(); + rules.push(oprs, LexerToken::Operator); + + const std::string sepChars = escapeRegex("(){}[],.;"); + rules.push("["+sepChars+"]", LexerToken::Separator); + + lexertl::generator::build(rules, sm); + } + catch (const lexertl::runtime_error &e) { + [[maybe_unused]] const char *what = e.what(); + assert(!"lexer: looks like a regex is invalid"); + std::abort(); + } + } + +} // namespace pl::core