diff --git a/_automation/treesitter_updater/main.go b/_automation/treesitter_updater/main.go index 95b3c911..a5f3456c 100644 --- a/_automation/treesitter_updater/main.go +++ b/_automation/treesitter_updater/main.go @@ -16,8 +16,10 @@ import ( ) // Constants for the Tree Sitter version and download URL -const sitterVersion = "0.22.5" -const sitterURL = "https://github.com/tree-sitter/tree-sitter/archive/refs/tags/v" + sitterVersion + ".tar.gz" +const ( + sitterVersion = "0.25.9" + sitterURL = "https://github.com/tree-sitter/tree-sitter/archive/refs/tags/v" + sitterVersion + ".tar.gz" +) func main() { // Get the current working directory @@ -40,6 +42,7 @@ func main() { copyFiles(filepath.Join(parentPath, "lib", "src"), filepath.Join(currentDir, "tmpts"), "*.c") copyFiles(filepath.Join(parentPath, "lib", "src"), filepath.Join(currentDir, "tmpts"), "*.h") copyFiles(filepath.Join(parentPath, "lib", "src", "unicode"), filepath.Join(currentDir, "tmpts"), "*.h") + copyFiles(filepath.Join(parentPath, "lib", "src", "portable"), filepath.Join(currentDir, "tmpts"), "*.h") // Remove the original extracted directory err = os.RemoveAll(parentPath) @@ -125,7 +128,7 @@ func copyFile(src, dst string) error { } // Write the file to destination - err = ioutil.WriteFile(dst, input, 0644) + err = ioutil.WriteFile(dst, input, 0o644) if err != nil { return err } @@ -154,6 +157,7 @@ func modifyIncludePaths(path string) error { // Modify the content and write back modifiedContent := strings.ReplaceAll(string(content), `"tree_sitter/`, `"`) modifiedContent = strings.ReplaceAll(modifiedContent, `"unicode/`, `"`) + modifiedContent = strings.ReplaceAll(modifiedContent, `"portable/`, `"`) return os.WriteFile(filePath, []byte(modifiedContent), info.Mode()) }) } @@ -196,7 +200,7 @@ func downloadAndExtractSitter(url, version string) error { // Create directories and files as needed switch header.Typeflag { case tar.TypeDir: - if err := os.MkdirAll(target, 0755); err != nil { + if err := 
os.MkdirAll(target, 0o755); err != nil { return err } case tar.TypeReg: @@ -230,7 +234,6 @@ func cleanup(path string) { } return nil }) - if err != nil { // Handle the error } diff --git a/alloc.h b/alloc.h index a0eadb7a..a27b8a63 100644 --- a/alloc.h +++ b/alloc.h @@ -15,10 +15,10 @@ extern "C" { #define TS_PUBLIC __attribute__((visibility("default"))) #endif -TS_PUBLIC extern void *(*ts_current_malloc)(size_t); -TS_PUBLIC extern void *(*ts_current_calloc)(size_t, size_t); -TS_PUBLIC extern void *(*ts_current_realloc)(void *, size_t); -TS_PUBLIC extern void (*ts_current_free)(void *); +TS_PUBLIC extern void *(*ts_current_malloc)(size_t size); +TS_PUBLIC extern void *(*ts_current_calloc)(size_t count, size_t size); +TS_PUBLIC extern void *(*ts_current_realloc)(void *ptr, size_t size); +TS_PUBLIC extern void (*ts_current_free)(void *ptr); // Allow clients to override allocation functions #ifndef ts_malloc diff --git a/api.h b/api.h index de122289..2bbfe66f 100644 --- a/api.h +++ b/api.h @@ -7,14 +7,14 @@ #endif #endif -#ifdef __cplusplus -extern "C" { -#endif - #include #include #include +#ifdef __cplusplus +extern "C" { +#endif + /****************************/ /* Section - ABI Versioning */ /****************************/ @@ -26,7 +26,7 @@ extern "C" { * The Tree-sitter library is generally backwards-compatible with languages * generated using older CLI versions, but is not forwards-compatible. */ -#define TREE_SITTER_LANGUAGE_VERSION 14 +#define TREE_SITTER_LANGUAGE_VERSION 15 /** * The earliest ABI version that is supported by the current version of the @@ -48,14 +48,26 @@ typedef struct TSQuery TSQuery; typedef struct TSQueryCursor TSQueryCursor; typedef struct TSLookaheadIterator TSLookaheadIterator; +// This function signature reads one code point from the given string, +// returning the number of bytes consumed. It should write the code point +// to the `code_point` pointer, or write -1 if the input is invalid. 
+typedef uint32_t (*DecodeFunction)( + const uint8_t *string, + uint32_t length, + int32_t *code_point +); + typedef enum TSInputEncoding { TSInputEncodingUTF8, - TSInputEncodingUTF16, + TSInputEncodingUTF16LE, + TSInputEncodingUTF16BE, + TSInputEncodingCustom } TSInputEncoding; typedef enum TSSymbolType { TSSymbolTypeRegular, TSSymbolTypeAnonymous, + TSSymbolTypeSupertype, TSSymbolTypeAuxiliary, } TSSymbolType; @@ -75,8 +87,20 @@ typedef struct TSInput { void *payload; const char *(*read)(void *payload, uint32_t byte_index, TSPoint position, uint32_t *bytes_read); TSInputEncoding encoding; + DecodeFunction decode; } TSInput; +typedef struct TSParseState { + void *payload; + uint32_t current_byte_offset; + bool has_error; +} TSParseState; + +typedef struct TSParseOptions { + void *payload; + bool (*progress_callback)(TSParseState *state); +} TSParseOptions; + typedef enum TSLogType { TSLogTypeParse, TSLogTypeLex, @@ -149,6 +173,30 @@ typedef enum TSQueryError { TSQueryErrorLanguage, } TSQueryError; +typedef struct TSQueryCursorState { + void *payload; + uint32_t current_byte_offset; +} TSQueryCursorState; + +typedef struct TSQueryCursorOptions { + void *payload; + bool (*progress_callback)(TSQueryCursorState *state); +} TSQueryCursorOptions; + +/** + * The metadata associated with a language. + * + * Currently, this metadata can be used to check the [Semantic Version](https://semver.org/) + * of the language. This version information should be used to signal if a given parser might + * be incompatible with existing queries when upgrading between major versions, or minor versions + * if it's in zerover. 
+ */ +typedef struct TSLanguageMetadata { + uint8_t major_version; + uint8_t minor_version; + uint8_t patch_version; +} TSLanguageMetadata; + /********************/ /* Section - Parser */ /********************/ @@ -174,7 +222,7 @@ const TSLanguage *ts_parser_language(const TSParser *self); * Returns a boolean indicating whether or not the language was successfully * assigned. True means assignment succeeded. False means there was a version * mismatch: the language was generated with an incompatible version of the - * Tree-sitter CLI. Check the language's version using [`ts_language_version`] + * Tree-sitter CLI. Check the language's ABI version using [`ts_language_abi_version`] * and compare it to this library's [`TREE_SITTER_LANGUAGE_VERSION`] and * [`TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION`] constants. */ @@ -245,7 +293,7 @@ const TSRange *ts_parser_included_ranges( * `TSInputEncodingUTF8` or `TSInputEncodingUTF16`. * * This function returns a syntax tree on success, and `NULL` on failure. There - * are three possible reasons for failure: + * are four possible reasons for failure: * 1. The parser does not have a language assigned. Check for this using the [`ts_parser_language`] function. * 2. Parsing was cancelled due to a timeout that was set by an earlier call to @@ -257,6 +305,8 @@ const TSRange *ts_parser_included_ranges( * earlier call to [`ts_parser_set_cancellation_flag`]. You can resume parsing * from where the parser left out by calling [`ts_parser_parse`] again with * the same arguments. + * 4. Parsing was cancelled due to the progress callback returning true. This callback + * is passed in [`ts_parser_parse_with_options`] inside the [`TSParseOptions`] struct. * * [`read`]: TSInput::read * [`payload`]: TSInput::payload @@ -269,6 +319,20 @@ TSTree *ts_parser_parse( TSInput input ); +/** + * Use the parser to parse some source code and create a syntax tree, with some options. + * + * See [`ts_parser_parse`] for more details. 
+ * + * See [`TSParseOptions`] for more details on the options. + */ +TSTree* ts_parser_parse_with_options( + TSParser *self, + const TSTree *old_tree, + TSInput input, + TSParseOptions parse_options +); + /** * Use the parser to parse some source code stored in one contiguous buffer. * The first two parameters are the same as in the [`ts_parser_parse`] function @@ -308,6 +372,8 @@ TSTree *ts_parser_parse_string_encoding( void ts_parser_reset(TSParser *self); /** + * @deprecated use [`ts_parser_parse_with_options`] and pass in a callback instead, this will be removed in 0.26. + * * Set the maximum duration in microseconds that parsing should be allowed to * take before halting. * @@ -317,11 +383,15 @@ void ts_parser_reset(TSParser *self); void ts_parser_set_timeout_micros(TSParser *self, uint64_t timeout_micros); /** + * @deprecated use [`ts_parser_parse_with_options`] and pass in a callback instead, this will be removed in 0.26. + * * Get the duration in microseconds that parsing is allowed to take. */ uint64_t ts_parser_timeout_micros(const TSParser *self); /** + * @deprecated use [`ts_parser_parse_with_options`] and pass in a callback instead, this will be removed in 0.26. + * * Set the parser's current cancellation flag pointer. * * If a non-null pointer is assigned, then the parser will periodically read @@ -331,6 +401,8 @@ uint64_t ts_parser_timeout_micros(const TSParser *self); void ts_parser_set_cancellation_flag(TSParser *self, const size_t *flag); /** + * @deprecated use [`ts_parser_parse_with_options`] and pass in a callback instead, this will be removed in 0.26. + * * Get the parser's current cancellation flag pointer. */ const size_t *ts_parser_cancellation_flag(const TSParser *self); @@ -420,6 +492,13 @@ void ts_tree_edit(TSTree *self, const TSInputEdit *edit); * You need to pass the old tree that was passed to parse, as well as the new * tree that was returned from that function. 
* + * The returned ranges indicate areas where the hierarchical structure of syntax + * nodes (from root to leaf) has changed between the old and new trees. Characters + * outside these ranges have identical ancestor nodes in both trees. + * + * Note that the returned ranges may be slightly larger than the exact changed areas, + * but Tree-sitter attempts to make them as small as possible. + * * The returned array is allocated using `malloc` and the caller is responsible * for freeing it using `free`. The length of the array will be written to the * given `length` pointer. @@ -548,9 +627,18 @@ TSStateId ts_node_next_parse_state(TSNode self); /** * Get the node's immediate parent. + * Prefer [`ts_node_child_with_descendant`] for + * iterating over the node's ancestors. */ TSNode ts_node_parent(TSNode self); +/** + * Get the node that contains `descendant`. + * + * Note that this can return `descendant` itself. + */ +TSNode ts_node_child_with_descendant(TSNode self, TSNode descendant); + /** * Get the node's child at the given index, where zero represents the first * child. @@ -563,6 +651,12 @@ TSNode ts_node_child(TSNode self, uint32_t child_index); */ const char *ts_node_field_name_for_child(TSNode self, uint32_t child_index); +/** + * Get the field name for node's named child at the given index, where zero + * represents the first named child. Returns NULL, if no field is found. + */ +const char *ts_node_field_name_for_named_child(TSNode self, uint32_t named_child_index); + /** * Get the node's number of children. */ @@ -612,12 +706,12 @@ TSNode ts_node_next_named_sibling(TSNode self); TSNode ts_node_prev_named_sibling(TSNode self); /** - * Get the node's first child that extends beyond the given byte offset. + * Get the node's first child that contains or starts after the given byte offset. */ TSNode ts_node_first_child_for_byte(TSNode self, uint32_t byte); /** - * Get the node's first named child that extends beyond the given byte offset. 
+ * Get the node's first named child that contains or starts after the given byte offset. */ TSNode ts_node_first_named_child_for_byte(TSNode self, uint32_t byte); @@ -666,6 +760,9 @@ bool ts_node_eq(TSNode self, TSNode other); * A tree cursor allows you to walk a syntax tree more efficiently than is * possible using the [`TSNode`] functions. It is a mutable object that is always * on a certain syntax node, and can be moved imperatively to different nodes. + * + * Note that the given node is considered the root of the cursor, + * and the cursor cannot walk outside this node. */ TSTreeCursor ts_tree_cursor_new(TSNode node); @@ -675,7 +772,8 @@ TSTreeCursor ts_tree_cursor_new(TSNode node); void ts_tree_cursor_delete(TSTreeCursor *self); /** - * Re-initialize a tree cursor to start at a different node. + * Re-initialize a tree cursor to start at the original node that the cursor was + * constructed with. */ void ts_tree_cursor_reset(TSTreeCursor *self, TSNode node); @@ -713,6 +811,9 @@ TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *self); * * This returns `true` if the cursor successfully moved, and returns `false` * if there was no parent node (the cursor was already on the root node). + * + * Note that the node the cursor was constructed with is considered the root + * of the cursor, and the cursor cannot walk outside this node. */ bool ts_tree_cursor_goto_parent(TSTreeCursor *self); @@ -721,6 +822,9 @@ bool ts_tree_cursor_goto_parent(TSTreeCursor *self); * * This returns `true` if the cursor successfully moved, and returns `false` * if there was no next sibling node. + * + * Note that the node the cursor was constructed with is considered the root + * of the cursor, and the cursor cannot walk outside this node. 
*/ bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *self); @@ -732,8 +836,10 @@ bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *self); * * Note, that this function may be slower than * [`ts_tree_cursor_goto_next_sibling`] due to how node positions are stored. In - * the worst case, this will need to iterate through all the children upto the - * previous sibling node to recalculate its position. + * the worst case, this will need to iterate through all the children up to the + * previous sibling node to recalculate its position. Also note that the node the cursor + * was constructed with is considered the root of the cursor, and the cursor cannot + * walk outside this node. */ bool ts_tree_cursor_goto_previous_sibling(TSTreeCursor *self); @@ -777,7 +883,7 @@ uint32_t ts_tree_cursor_current_descendant_index(const TSTreeCursor *self); uint32_t ts_tree_cursor_current_depth(const TSTreeCursor *self); /** - * Move the cursor to the first child of its current node that extends beyond + * Move the cursor to the first child of its current node that contains or starts after * the given byte offset or point. * * This returns the index of the child node if one was found, and returns -1 @@ -831,6 +937,14 @@ uint32_t ts_query_string_count(const TSQuery *self); */ uint32_t ts_query_start_byte_for_pattern(const TSQuery *self, uint32_t pattern_index); +/** + * Get the byte offset where the given pattern ends in the query's source. + * + * This can be useful when combining queries by concatenating their source + * code strings. + */ +uint32_t ts_query_end_byte_for_pattern(const TSQuery *self, uint32_t pattern_index); + /** * Get all of the predicates for the given pattern in the query. * @@ -952,6 +1066,16 @@ void ts_query_cursor_delete(TSQueryCursor *self); */ void ts_query_cursor_exec(TSQueryCursor *self, const TSQuery *query, TSNode node); +/** + * Start running a given query on a given node, with some options. 
+ */ +void ts_query_cursor_exec_with_options( + TSQueryCursor *self, + const TSQuery *query, + TSNode node, + const TSQueryCursorOptions *query_options +); + /** * Manage the maximum number of in-progress matches allowed by this query * cursor. @@ -968,11 +1092,58 @@ uint32_t ts_query_cursor_match_limit(const TSQueryCursor *self); void ts_query_cursor_set_match_limit(TSQueryCursor *self, uint32_t limit); /** - * Set the range of bytes or (row, column) positions in which the query - * will be executed. + * @deprecated use [`ts_query_cursor_exec_with_options`] and pass in a callback instead, this will be removed in 0.26. + * + * Set the maximum duration in microseconds that query execution should be allowed to + * take before halting. + * + * If query execution takes longer than this, it will halt early, returning NULL. + * See [`ts_query_cursor_next_match`] or [`ts_query_cursor_next_capture`] for more information. + */ +void ts_query_cursor_set_timeout_micros(TSQueryCursor *self, uint64_t timeout_micros); + +/** + * @deprecated use [`ts_query_cursor_exec_with_options`] and pass in a callback instead, this will be removed in 0.26. + * + * Get the duration in microseconds that query execution is allowed to take. + * + * This is set via [`ts_query_cursor_set_timeout_micros`]. + */ +uint64_t ts_query_cursor_timeout_micros(const TSQueryCursor *self); + +/** + * Set the range of bytes in which the query will be executed. + * + * The query cursor will return matches that intersect with the given byte range. + * This means that a match may be returned even if some of its captures fall + * outside the specified range, as long as at least part of the match + * overlaps with the range. + * + * For example, if a query pattern matches a node that spans a larger area + * than the specified range, but part of that node intersects with the range, + * the entire match will be returned. 
+ * + * This will return `false` if the start byte is greater than the end byte, otherwise + * it will return `true`. */ -void ts_query_cursor_set_byte_range(TSQueryCursor *self, uint32_t start_byte, uint32_t end_byte); -void ts_query_cursor_set_point_range(TSQueryCursor *self, TSPoint start_point, TSPoint end_point); +bool ts_query_cursor_set_byte_range(TSQueryCursor *self, uint32_t start_byte, uint32_t end_byte); + +/** + * Set the range of (row, column) positions in which the query will be executed. + * + * The query cursor will return matches that intersect with the given point range. + * This means that a match may be returned even if some of its captures fall + * outside the specified range, as long as at least part of the match + * overlaps with the range. + * + * For example, if a query pattern matches a node that spans a larger area + * than the specified range, but part of that node intersects with the range, + * the entire match will be returned. + * + * This will return `false` if the start point is greater than the end point, otherwise + * it will return `true`. + */ +bool ts_query_cursor_set_point_range(TSQueryCursor *self, TSPoint start_point, TSPoint end_point); /** * Advance to the next match of the currently running query. @@ -987,7 +1158,7 @@ void ts_query_cursor_remove_match(TSQueryCursor *self, uint32_t match_id); * Advance to the next capture of the currently running query. * * If there is a capture, write its match to `*match` and its index within - * the matche's capture list to `*capture_index`. Otherwise, return `false`. + * the match's capture list to `*capture_index`. Otherwise, return `false`. */ bool ts_query_cursor_next_capture( TSQueryCursor *self, @@ -1036,11 +1207,6 @@ uint32_t ts_language_symbol_count(const TSLanguage *self); */ uint32_t ts_language_state_count(const TSLanguage *self); -/** - * Get a node type string for the given numerical id. 
- */ -const char *ts_language_symbol_name(const TSLanguage *self, TSSymbol symbol); - /** * Get the numerical id for the given node type string. */ @@ -1066,6 +1232,27 @@ const char *ts_language_field_name_for_id(const TSLanguage *self, TSFieldId id); */ TSFieldId ts_language_field_id_for_name(const TSLanguage *self, const char *name, uint32_t name_length); +/** + * Get a list of all supertype symbols for the language. +*/ +const TSSymbol *ts_language_supertypes(const TSLanguage *self, uint32_t *length); + +/** + * Get a list of all subtype symbol ids for a given supertype symbol. + * + * See [`ts_language_supertypes`] for fetching all supertype symbols. + */ +const TSSymbol *ts_language_subtypes( + const TSLanguage *self, + TSSymbol supertype, + uint32_t *length +); + +/** + * Get a node type string for the given numerical id. + */ +const char *ts_language_symbol_name(const TSLanguage *self, TSSymbol symbol); + /** * Check whether the given node type id belongs to named nodes, anonymous nodes, * or a hidden nodes. @@ -1075,6 +1262,8 @@ TSFieldId ts_language_field_id_for_name(const TSLanguage *self, const char *name TSSymbolType ts_language_symbol_type(const TSLanguage *self, TSSymbol symbol); /** + * @deprecated use [`ts_language_abi_version`] instead, this will be removed in 0.26. + * * Get the ABI version number for this language. This version number is used * to ensure that languages were generated by a compatible version of * Tree-sitter. @@ -1083,6 +1272,24 @@ TSSymbolType ts_language_symbol_type(const TSLanguage *self, TSSymbol symbol); */ uint32_t ts_language_version(const TSLanguage *self); +/** + * Get the ABI version number for this language. This version number is used + * to ensure that languages were generated by a compatible version of + * Tree-sitter. + * + * See also [`ts_parser_set_language`]. + */ +uint32_t ts_language_abi_version(const TSLanguage *self); + +/** + * Get the metadata for this language. 
This information is generated by the + * CLI, and relies on the language author providing the correct metadata in + * the language's `tree-sitter.json` file. + * + * See also [`TSLanguageMetadata`]. + */ +const TSLanguageMetadata *ts_language_metadata(const TSLanguage *self); + /** * Get the next parse state. Combine this with lookahead iterators to generate * completion suggestions or valid symbols in error nodes. Use @@ -1090,6 +1297,11 @@ uint32_t ts_language_version(const TSLanguage *self); */ TSStateId ts_language_next_state(const TSLanguage *self, TSStateId state, TSSymbol symbol); +/** + * Get the name of this language. This returns `NULL` in older parsers. + */ +const char *ts_language_name(const TSLanguage *self); + /********************************/ /* Section - Lookahead Iterator */ /********************************/ diff --git a/array.h b/array.h index 15a3b233..d965c617 100644 --- a/array.h +++ b/array.h @@ -6,14 +6,15 @@ extern "C" { #endif #include "./alloc.h" +#include "./ts_assert.h" -#include #include #include #include #include #ifdef _MSC_VER +#pragma warning(push) #pragma warning(disable : 4101) #elif defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic push @@ -37,7 +38,7 @@ extern "C" { /// Get a pointer to the element at a given `index` in the array. #define array_get(self, _index) \ - (assert((uint32_t)(_index) < (self)->size), &(self)->contents[_index]) + (ts_assert((uint32_t)(_index) < (self)->size), &(self)->contents[_index]) /// Get a pointer to the first element in the array. #define array_front(self) array_get(self, 0) @@ -171,7 +172,7 @@ static inline void _array__delete(Array *self) { /// This is not what you're looking for, see `array_erase`. 
static inline void _array__erase(Array *self, size_t element_size, uint32_t index) { - assert(index < self->size); + ts_assert(index < self->size); char *contents = (char *)self->contents; memmove(contents + index * element_size, contents + (index + 1) * element_size, (self->size - index - 1) * element_size); @@ -222,7 +223,7 @@ static inline void _array__splice(Array *self, size_t element_size, uint32_t new_size = self->size + new_count - old_count; uint32_t old_end = index + old_count; uint32_t new_end = index + new_count; - assert(old_end <= self->size); + ts_assert(old_end <= self->size); _array__reserve(self, element_size, new_size); @@ -278,7 +279,7 @@ static inline void _array__splice(Array *self, size_t element_size, #define _compare_int(a, b) ((int)*(a) - (int)(b)) #ifdef _MSC_VER -#pragma warning(default : 4101) +#pragma warning(pop) #elif defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic pop #endif diff --git a/clock.h b/clock.h index 6e75729e..7a13185e 100644 --- a/clock.h +++ b/clock.h @@ -49,9 +49,9 @@ static inline bool clock_is_gt(TSClock self, TSClock other) { return self > other; } -#elif defined(CLOCK_MONOTONIC) && !defined(__APPLE__) +#elif defined(CLOCK_MONOTONIC) -// POSIX with monotonic clock support (Linux) +// POSIX with monotonic clock support (Linux, macOS) // * Represent a time as a monotonic (seconds, nanoseconds) pair. // * Represent a duration as a number of microseconds. // @@ -91,7 +91,7 @@ static inline TSClock clock_after(TSClock base, TSDuration duration) { } static inline bool clock_is_null(TSClock self) { - return !self.tv_sec; + return !self.tv_sec && !self.tv_nsec; } static inline bool clock_is_gt(TSClock self, TSClock other) { @@ -102,7 +102,7 @@ static inline bool clock_is_gt(TSClock self, TSClock other) { #else -// macOS or POSIX without monotonic clock support +// POSIX without monotonic clock support // * Represent a time as a process clock value. // * Represent a duration as a number of process clock ticks. 
// diff --git a/endian.h b/endian.h new file mode 100644 index 00000000..a6560826 --- /dev/null +++ b/endian.h @@ -0,0 +1,241 @@ +// "License": Public Domain +// I, Mathias Panzenböck, place this file hereby into the public domain. Use it at your own risk for whatever you like. +// In case there are jurisdictions that don't support putting things in the public domain you can also consider it to +// be "dual licensed" under the BSD, MIT and Apache licenses, if you want to. This code is trivial anyway. Consider it +// an example on how to get the endian conversion functions on different platforms. + +// updates from https://github.com/mikepb/endian.h/issues/4 + +#ifndef ENDIAN_H +#define ENDIAN_H + +#if (defined(_WIN16) || defined(_WIN32) || defined(_WIN64)) && !defined(__WINDOWS__) + +# define __WINDOWS__ + +#endif + +#if defined(HAVE_ENDIAN_H) || \ + defined(__linux__) || \ + defined(__GNU__) || \ + defined(__HAIKU__) || \ + defined(__illumos__) || \ + defined(__NetBSD__) || \ + defined(__OpenBSD__) || \ + defined(__CYGWIN__) || \ + defined(__MSYS__) || \ + defined(__EMSCRIPTEN__) || \ + defined(__wasi__) || \ + defined(__wasm__) + +#if defined(__NetBSD__) +#define _NETBSD_SOURCE 1 +#endif + +# include + +#elif defined(HAVE_SYS_ENDIAN_H) || \ + defined(__FreeBSD__) || \ + defined(__DragonFly__) + +# include + +#elif defined(__APPLE__) +# define __BYTE_ORDER BYTE_ORDER +# define __BIG_ENDIAN BIG_ENDIAN +# define __LITTLE_ENDIAN LITTLE_ENDIAN +# define __PDP_ENDIAN PDP_ENDIAN + +# if !defined(_POSIX_C_SOURCE) +# include + +# define htobe16(x) OSSwapHostToBigInt16(x) +# define htole16(x) OSSwapHostToLittleInt16(x) +# define be16toh(x) OSSwapBigToHostInt16(x) +# define le16toh(x) OSSwapLittleToHostInt16(x) + +# define htobe32(x) OSSwapHostToBigInt32(x) +# define htole32(x) OSSwapHostToLittleInt32(x) +# define be32toh(x) OSSwapBigToHostInt32(x) +# define le32toh(x) OSSwapLittleToHostInt32(x) + +# define htobe64(x) OSSwapHostToBigInt64(x) +# define htole64(x) 
OSSwapHostToLittleInt64(x) +# define be64toh(x) OSSwapBigToHostInt64(x) +# define le64toh(x) OSSwapLittleToHostInt64(x) +# else +# if BYTE_ORDER == LITTLE_ENDIAN +# define htobe16(x) __builtin_bswap16(x) +# define htole16(x) (x) +# define be16toh(x) __builtin_bswap16(x) +# define le16toh(x) (x) + +# define htobe32(x) __builtin_bswap32(x) +# define htole32(x) (x) +# define be32toh(x) __builtin_bswap32(x) +# define le32toh(x) (x) + +# define htobe64(x) __builtin_bswap64(x) +# define htole64(x) (x) +# define be64toh(x) __builtin_bswap64(x) +# define le64toh(x) (x) +# elif BYTE_ORDER == BIG_ENDIAN +# define htobe16(x) (x) +# define htole16(x) __builtin_bswap16(x) +# define be16toh(x) (x) +# define le16toh(x) __builtin_bswap16(x) + +# define htobe32(x) (x) +# define htole32(x) __builtin_bswap32(x) +# define be32toh(x) (x) +# define le32toh(x) __builtin_bswap32(x) + +# define htobe64(x) (x) +# define htole64(x) __builtin_bswap64(x) +# define be64toh(x) (x) +# define le64toh(x) __builtin_bswap64(x) +# else +# error byte order not supported +# endif +# endif + +#elif defined(__WINDOWS__) + +# if defined(_MSC_VER) && !defined(__clang__) +# include +# define B_SWAP_16(x) _byteswap_ushort(x) +# define B_SWAP_32(x) _byteswap_ulong(x) +# define B_SWAP_64(x) _byteswap_uint64(x) +# else +# define B_SWAP_16(x) __builtin_bswap16(x) +# define B_SWAP_32(x) __builtin_bswap32(x) +# define B_SWAP_64(x) __builtin_bswap64(x) +# endif + +# if defined(__MINGW32__) || defined(HAVE_SYS_PARAM_H) +# include +# endif + +# ifndef BIG_ENDIAN +# ifdef __BIG_ENDIAN +# define BIG_ENDIAN __BIG_ENDIAN +# elif defined(__ORDER_BIG_ENDIAN__) +# define BIG_ENDIAN __ORDER_BIG_ENDIAN__ +# else +# define BIG_ENDIAN 4321 +# endif +# endif + +# ifndef LITTLE_ENDIAN +# ifdef __LITTLE_ENDIAN +# define LITTLE_ENDIAN __LITTLE_ENDIAN +# elif defined(__ORDER_LITTLE_ENDIAN__) +# define LITTLE_ENDIAN __ORDER_LITTLE_ENDIAN__ +# else +# define LITTLE_ENDIAN 1234 +# endif +# endif + +# ifndef BYTE_ORDER +# ifdef 
__BYTE_ORDER +# define BYTE_ORDER __BYTE_ORDER +# elif defined(__BYTE_ORDER__) +# define BYTE_ORDER __BYTE_ORDER__ +# else + /* assume LE on Windows if nothing was defined */ +# define BYTE_ORDER LITTLE_ENDIAN +# endif +# endif + +# if BYTE_ORDER == LITTLE_ENDIAN + +# define htobe16(x) B_SWAP_16(x) +# define htole16(x) (x) +# define be16toh(x) B_SWAP_16(x) +# define le16toh(x) (x) + +# define htobe32(x) B_SWAP_32(x) +# define htole32(x) (x) +# define be32toh(x) B_SWAP_32(x) +# define le32toh(x) (x) + +# define htobe64(x) B_SWAP_64(x) +# define htole64(x) (x) +# define be64toh(x) B_SWAP_64(x) +# define le64toh(x) (x) + +# elif BYTE_ORDER == BIG_ENDIAN + +# define htobe16(x) (x) +# define htole16(x) B_SWAP_16(x) +# define be16toh(x) (x) +# define le16toh(x) B_SWAP_16(x) + +# define htobe32(x) (x) +# define htole32(x) B_SWAP_32(x) +# define be32toh(x) (x) +# define le32toh(x) B_SWAP_32(x) + +# define htobe64(x) (x) +# define htole64(x) B_SWAP_64(x) +# define be64toh(x) (x) +# define le64toh(x) B_SWAP_64(x) + +# else + +# error byte order not supported + +# endif + +#elif defined(__QNXNTO__) + +# include + +# define __LITTLE_ENDIAN 1234 +# define __BIG_ENDIAN 4321 +# define __PDP_ENDIAN 3412 + +# if defined(__BIGENDIAN__) + +# define __BYTE_ORDER __BIG_ENDIAN + +# define htobe16(x) (x) +# define htobe32(x) (x) +# define htobe64(x) (x) + +# define htole16(x) ENDIAN_SWAP16(x) +# define htole32(x) ENDIAN_SWAP32(x) +# define htole64(x) ENDIAN_SWAP64(x) + +# elif defined(__LITTLEENDIAN__) + +# define __BYTE_ORDER __LITTLE_ENDIAN + +# define htole16(x) (x) +# define htole32(x) (x) +# define htole64(x) (x) + +# define htobe16(x) ENDIAN_SWAP16(x) +# define htobe32(x) ENDIAN_SWAP32(x) +# define htobe64(x) ENDIAN_SWAP64(x) + +# else + +# error byte order not supported + +# endif + +# define be16toh(x) ENDIAN_BE16(x) +# define be32toh(x) ENDIAN_BE32(x) +# define be64toh(x) ENDIAN_BE64(x) +# define le16toh(x) ENDIAN_LE16(x) +# define le32toh(x) ENDIAN_LE32(x) +# define le64toh(x) 
ENDIAN_LE64(x) + +#else + +# error platform not supported + +#endif + +#endif diff --git a/get_changed_ranges.c b/get_changed_ranges.c index bcf8da94..11084c33 100644 --- a/get_changed_ranges.c +++ b/get_changed_ranges.c @@ -3,7 +3,7 @@ #include "./language.h" #include "./error_costs.h" #include "./tree_cursor.h" -#include +#include "./ts_assert.h" // #define DEBUG_GET_CHANGED_RANGES @@ -34,7 +34,7 @@ bool ts_range_array_intersects( uint32_t end_byte ) { for (unsigned i = start_index; i < self->size; i++) { - TSRange *range = &self->contents[i]; + TSRange *range = array_get(self, i); if (range->end_byte > start_byte) { if (range->start_byte >= end_byte) break; return true; @@ -108,6 +108,7 @@ typedef struct { const TSLanguage *language; unsigned visible_depth; bool in_padding; + Subtree prev_external_token; } Iterator; static Iterator iterator_new( @@ -127,6 +128,7 @@ static Iterator iterator_new( .language = language, .visible_depth = 1, .in_padding = false, + .prev_external_token = NULL_SUBTREE, }; } @@ -157,7 +159,7 @@ static bool iterator_tree_is_visible(const Iterator *self) { TreeCursorEntry entry = *array_back(&self->cursor.stack); if (ts_subtree_visible(*entry.subtree)) return true; if (self->cursor.stack.size > 1) { - Subtree parent = *self->cursor.stack.contents[self->cursor.stack.size - 2].subtree; + Subtree parent = *array_get(&self->cursor.stack, self->cursor.stack.size - 2)->subtree; return ts_language_alias_at( self->language, parent.ptr->production_id, @@ -181,10 +183,10 @@ static void iterator_get_visible_state( } for (; i + 1 > 0; i--) { - TreeCursorEntry entry = self->cursor.stack.contents[i]; + TreeCursorEntry entry = *array_get(&self->cursor.stack, i); if (i > 0) { - const Subtree *parent = self->cursor.stack.contents[i - 1].subtree; + const Subtree *parent = array_get(&self->cursor.stack, i - 1)->subtree; *alias_symbol = ts_language_alias_at( self->language, parent->ptr->production_id, @@ -244,6 +246,10 @@ static bool iterator_descend(Iterator 
*self, uint32_t goal_position) { position = child_right; if (!ts_subtree_extra(*child)) structural_child_index++; + Subtree last_external_token = ts_subtree_last_external_token(*child); + if (last_external_token.ptr) { + self->prev_external_token = last_external_token; + } } } while (did_descend); @@ -268,6 +274,10 @@ static void iterator_advance(Iterator *self) { const Subtree *parent = array_back(&self->cursor.stack)->subtree; uint32_t child_index = entry.child_index + 1; + Subtree last_external_token = ts_subtree_last_external_token(*entry.subtree); + if (last_external_token.ptr) { + self->prev_external_token = last_external_token; + } if (ts_subtree_child_count(*parent) > child_index) { Length position = length_add(entry.position, ts_subtree_total_size(*entry.subtree)); uint32_t structural_child_index = entry.structural_child_index; @@ -313,29 +323,41 @@ static IteratorComparison iterator_compare( TSSymbol new_alias_symbol = 0; iterator_get_visible_state(old_iter, &old_tree, &old_alias_symbol, &old_start); iterator_get_visible_state(new_iter, &new_tree, &new_alias_symbol, &new_start); + TSSymbol old_symbol = ts_subtree_symbol(old_tree); + TSSymbol new_symbol = ts_subtree_symbol(new_tree); if (!old_tree.ptr && !new_tree.ptr) return IteratorMatches; if (!old_tree.ptr || !new_tree.ptr) return IteratorDiffers; + if (old_alias_symbol != new_alias_symbol || old_symbol != new_symbol) return IteratorDiffers; + + uint32_t old_size = ts_subtree_size(old_tree).bytes; + uint32_t new_size = ts_subtree_size(new_tree).bytes; + TSStateId old_state = ts_subtree_parse_state(old_tree); + TSStateId new_state = ts_subtree_parse_state(new_tree); + bool old_has_external_tokens = ts_subtree_has_external_tokens(old_tree); + bool new_has_external_tokens = ts_subtree_has_external_tokens(new_tree); + uint32_t old_error_cost = ts_subtree_error_cost(old_tree); + uint32_t new_error_cost = ts_subtree_error_cost(new_tree); if ( - old_alias_symbol == new_alias_symbol && - 
ts_subtree_symbol(old_tree) == ts_subtree_symbol(new_tree) + old_start != new_start || + old_symbol == ts_builtin_sym_error || + old_size != new_size || + old_state == TS_TREE_STATE_NONE || + new_state == TS_TREE_STATE_NONE || + ((old_state == ERROR_STATE) != (new_state == ERROR_STATE)) || + old_error_cost != new_error_cost || + old_has_external_tokens != new_has_external_tokens || + ts_subtree_has_changes(old_tree) || + ( + old_has_external_tokens && + !ts_subtree_external_scanner_state_eq(old_iter->prev_external_token, new_iter->prev_external_token) + ) ) { - if (old_start == new_start && - !ts_subtree_has_changes(old_tree) && - ts_subtree_symbol(old_tree) != ts_builtin_sym_error && - ts_subtree_size(old_tree).bytes == ts_subtree_size(new_tree).bytes && - ts_subtree_parse_state(old_tree) != TS_TREE_STATE_NONE && - ts_subtree_parse_state(new_tree) != TS_TREE_STATE_NONE && - (ts_subtree_parse_state(old_tree) == ERROR_STATE) == - (ts_subtree_parse_state(new_tree) == ERROR_STATE)) { - return IteratorMatches; - } else { - return IteratorMayDiffer; - } + return IteratorMayDiffer; } - return IteratorDiffers; + return IteratorMatches; } #ifdef DEBUG_GET_CHANGED_RANGES @@ -348,8 +370,8 @@ static inline void iterator_print_state(Iterator *self) { "(%-25s %s\t depth:%u [%u, %u] - [%u, %u])", name, self->in_padding ? 
"(p)" : " ", self->visible_depth, - start.row + 1, start.column, - end.row + 1, end.column + start.row, start.column, + end.row, end.column ); } #endif @@ -380,7 +402,7 @@ unsigned ts_subtree_get_changed_ranges( do { #ifdef DEBUG_GET_CHANGED_RANGES - printf("At [%-2u, %-2u] Compare ", position.extent.row + 1, position.extent.column); + printf("At [%-2u, %-2u] Compare ", position.extent.row, position.extent.column); iterator_print_state(&old_iter); printf("\tvs\t"); iterator_print_state(&new_iter); @@ -475,9 +497,9 @@ unsigned ts_subtree_get_changed_ranges( // Keep track of the current position in the included range differences // array in order to avoid scanning the entire array on each iteration. while (included_range_difference_index < included_range_differences->size) { - const TSRange *range = &included_range_differences->contents[ + const TSRange *range = array_get(included_range_differences, included_range_difference_index - ]; + ); if (range->end_byte <= position.bytes) { included_range_difference_index++; } else { diff --git a/language.c b/language.c index 84b15c01..43535528 100644 --- a/language.c +++ b/language.c @@ -24,8 +24,45 @@ uint32_t ts_language_state_count(const TSLanguage *self) { return self->state_count; } +const TSSymbol *ts_language_supertypes(const TSLanguage *self, uint32_t *length) { + if (self->abi_version >= LANGUAGE_VERSION_WITH_RESERVED_WORDS) { + *length = self->supertype_count; + return self->supertype_symbols; + } else { + *length = 0; + return NULL; + } +} + +const TSSymbol *ts_language_subtypes( + const TSLanguage *self, + TSSymbol supertype, + uint32_t *length +) { + if (self->abi_version < LANGUAGE_VERSION_WITH_RESERVED_WORDS || !ts_language_symbol_metadata(self, supertype).supertype) { + *length = 0; + return NULL; + } + + TSMapSlice slice = self->supertype_map_slices[supertype]; + *length = slice.length; + return &self->supertype_map_entries[slice.index]; +} + uint32_t ts_language_version(const TSLanguage *self) { - return 
self->version; + return self->abi_version; +} + +uint32_t ts_language_abi_version(const TSLanguage *self) { + return self->abi_version; +} + +const TSLanguageMetadata *ts_language_metadata(const TSLanguage *self) { + return self->abi_version >= LANGUAGE_VERSION_WITH_RESERVED_WORDS ? &self->metadata : NULL; +} + +const char *ts_language_name(const TSLanguage *self) { + return self->abi_version >= LANGUAGE_VERSION_WITH_RESERVED_WORDS ? self->name : NULL; } uint32_t ts_language_field_count(const TSLanguage *self) { @@ -43,7 +80,7 @@ void ts_language_table_entry( result->is_reusable = false; result->actions = NULL; } else { - assert(symbol < self->token_count); + ts_assert(symbol < self->token_count); uint32_t action_index = ts_language_lookup(self, state, symbol); const TSParseActionEntry *entry = &self->parse_actions[action_index]; result->action_count = entry->entry.count; @@ -52,6 +89,39 @@ void ts_language_table_entry( } } +TSLexerMode ts_language_lex_mode_for_state( + const TSLanguage *self, + TSStateId state +) { + if (self->abi_version < 15) { + TSLexMode mode = ((const TSLexMode *)self->lex_modes)[state]; + return (TSLexerMode) { + .lex_state = mode.lex_state, + .external_lex_state = mode.external_lex_state, + .reserved_word_set_id = 0, + }; + } else { + return self->lex_modes[state]; + } +} + +bool ts_language_is_reserved_word( + const TSLanguage *self, + TSStateId state, + TSSymbol symbol +) { + TSLexerMode lex_mode = ts_language_lex_mode_for_state(self, state); + if (lex_mode.reserved_word_set_id > 0) { + unsigned start = lex_mode.reserved_word_set_id * self->max_reserved_word_set_size; + unsigned end = start + self->max_reserved_word_set_size; + for (unsigned i = start; i < end; i++) { + if (self->reserved_words[i] == symbol) return true; + if (self->reserved_words[i] == 0) break; + } + } + return false; +} + TSSymbolMetadata ts_language_symbol_metadata( const TSLanguage *self, TSSymbol symbol @@ -116,7 +186,7 @@ TSSymbol ts_language_symbol_for_name( 
uint32_t length, bool is_named ) { - if (!strncmp(string, "ERROR", length)) return ts_builtin_sym_error; + if (is_named && !strncmp(string, "ERROR", length)) return ts_builtin_sym_error; uint16_t count = (uint16_t)ts_language_symbol_count(self); for (TSSymbol i = 0; i < count; i++) { TSSymbolMetadata metadata = ts_language_symbol_metadata(self, i); @@ -138,6 +208,8 @@ TSSymbolType ts_language_symbol_type( return TSSymbolTypeRegular; } else if (metadata.visible) { return TSSymbolTypeAnonymous; + } else if (metadata.supertype) { + return TSSymbolTypeSupertype; } else { return TSSymbolTypeAuxiliary; } diff --git a/language.h b/language.h index 4e2769b4..518c06bf 100644 --- a/language.h +++ b/language.h @@ -10,8 +10,8 @@ extern "C" { #define ts_builtin_sym_error_repeat (ts_builtin_sym_error - 1) +#define LANGUAGE_VERSION_WITH_RESERVED_WORDS 15 #define LANGUAGE_VERSION_WITH_PRIMARY_STATES 14 -#define LANGUAGE_VERSION_USABLE_VIA_WASM 13 typedef struct { const TSParseAction *actions; @@ -35,17 +35,11 @@ typedef struct { uint16_t action_count; } LookaheadIterator; -void ts_language_table_entry(const TSLanguage *, TSStateId, TSSymbol, TableEntry *); - -TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *, TSSymbol); - -TSSymbol ts_language_public_symbol(const TSLanguage *, TSSymbol); - -TSStateId ts_language_next_state(const TSLanguage *self, TSStateId state, TSSymbol symbol); - -static inline bool ts_language_is_symbol_external(const TSLanguage *self, TSSymbol symbol) { - return 0 < symbol && symbol < self->external_token_count + 1; -} +void ts_language_table_entry(const TSLanguage *self, TSStateId state, TSSymbol symbol, TableEntry *result); +TSLexerMode ts_language_lex_mode_for_state(const TSLanguage *self, TSStateId state); +bool ts_language_is_reserved_word(const TSLanguage *self, TSStateId state, TSSymbol symbol); +TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *self, TSSymbol symbol); +TSSymbol ts_language_public_symbol(const TSLanguage 
*self, TSSymbol symbol); static inline const TSParseAction *ts_language_actions( const TSLanguage *self, @@ -189,7 +183,7 @@ static inline bool ts_language_state_is_primary( const TSLanguage *self, TSStateId state ) { - if (self->version >= LANGUAGE_VERSION_WITH_PRIMARY_STATES) { + if (self->abi_version >= LANGUAGE_VERSION_WITH_PRIMARY_STATES) { return state == self->primary_state_ids[state]; } else { return true; @@ -238,7 +232,7 @@ static inline void ts_language_field_map( return; } - TSFieldMapSlice slice = self->field_map_slices[production_id]; + TSMapSlice slice = self->field_map_slices[production_id]; *start = &self->field_map_entries[slice.index]; *end = &self->field_map_entries[slice.index] + slice.length; } diff --git a/length.h b/length.h index dbae5ced..be3a46ad 100644 --- a/length.h +++ b/length.h @@ -31,7 +31,7 @@ static inline Length length_add(Length len1, Length len2) { static inline Length length_sub(Length len1, Length len2) { Length result; - result.bytes = len1.bytes - len2.bytes; + result.bytes = (len1.bytes >= len2.bytes) ? len1.bytes - len2.bytes : 0; result.extent = point_sub(len1.extent, len2.extent); return result; } diff --git a/lexer.c b/lexer.c index d108c04e..dd10b429 100644 --- a/lexer.c +++ b/lexer.c @@ -1,9 +1,12 @@ -#include -#include "./lexer.h" -#include "./subtree.h" #include "./length.h" +#include "./lexer.h" #include "./unicode.h" +#include "api.h" + +#include +#include + #define LOG(message, character) \ if (self->logger.log) { \ snprintf( \ @@ -36,6 +39,35 @@ static const TSRange DEFAULT_RANGE = { .end_byte = UINT32_MAX }; +/** + * Sets the column data to the given value and marks it valid. + * @param self The lexer state. + * @param val The new value of the column data. + */ +static void ts_lexer__set_column_data(Lexer *self, uint32_t val) { + self->column_data.valid = true; + self->column_data.value = val; +} + +/** + * Increments the value of the column data; no-op if invalid. + * @param self The lexer state. 
+ */ +static void ts_lexer__increment_column_data(Lexer *self) { + if (self->column_data.valid) { + self->column_data.value++; + } +} + +/** + * Marks the column data as invalid. + * @param self The lexer state. + */ +static void ts_lexer__invalidate_column_data(Lexer *self) { + self->column_data.valid = false; + self->column_data.value = 0; +} + // Check if the lexer has reached EOF. This state is stored // by setting the lexer's `current_included_range_index` such that // it has consumed all of its available ranges. @@ -82,9 +114,10 @@ static void ts_lexer__get_lookahead(Lexer *self) { } const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk; - UnicodeDecodeFunction decode = self->input.encoding == TSInputEncodingUTF8 - ? ts_decode_utf8 - : ts_decode_utf16; + DecodeFunction decode = + self->input.encoding == TSInputEncodingUTF8 ? ts_decode_utf8 : + self->input.encoding == TSInputEncodingUTF16LE ? ts_decode_utf16_le : + self->input.encoding == TSInputEncodingUTF16BE ? ts_decode_utf16_be : self->input.decode; self->lookahead_size = decode(chunk, size, &self->data.lookahead); @@ -103,6 +136,10 @@ static void ts_lexer__get_lookahead(Lexer *self) { } static void ts_lexer_goto(Lexer *self, Length position) { + if (position.bytes != self->current_position.bytes) { + ts_lexer__invalidate_column_data(self); + } + self->current_position = position; // Move to the first valid position at or after the given position. @@ -155,16 +192,24 @@ static void ts_lexer_goto(Lexer *self, Length position) { } } -// Intended to be called only from functions that control logging. +/** + * Actually advances the lexer. Does not log anything. + * @param self The lexer state. + * @param skip Whether to mark the consumed codepoint as whitespace. 
+ */ static void ts_lexer__do_advance(Lexer *self, bool skip) { if (self->lookahead_size) { - self->current_position.bytes += self->lookahead_size; if (self->data.lookahead == '\n') { self->current_position.extent.row++; self->current_position.extent.column = 0; + ts_lexer__set_column_data(self, 0); } else { + bool is_bom = self->current_position.bytes == 0 && + self->data.lookahead == BYTE_ORDER_MARK; + if (!is_bom) ts_lexer__increment_column_data(self); self->current_position.extent.column += self->lookahead_size; } + self->current_position.bytes += self->lookahead_size; } const TSRange *current_range = &self->included_ranges[self->current_included_range_index]; @@ -248,27 +293,33 @@ static void ts_lexer__mark_end(TSLexer *_self) { static uint32_t ts_lexer__get_column(TSLexer *_self) { Lexer *self = (Lexer *)_self; - uint32_t goal_byte = self->current_position.bytes; - self->did_get_column = true; - self->current_position.bytes -= self->current_position.extent.column; - self->current_position.extent.column = 0; - if (self->current_position.bytes < self->chunk_start) { + if (!self->column_data.valid) { + // Record current position + uint32_t goal_byte = self->current_position.bytes; + + // Back up to the beginning of the line + Length start_of_col = { + self->current_position.bytes - self->current_position.extent.column, + {self->current_position.extent.row, 0}, + }; + ts_lexer_goto(self, start_of_col); + ts_lexer__set_column_data(self, 0); ts_lexer__get_chunk(self); - } - uint32_t result = 0; - if (!ts_lexer__eof(_self)) { - ts_lexer__get_lookahead(self); - while (self->current_position.bytes < goal_byte && self->chunk) { - result++; - ts_lexer__do_advance(self, false); - if (ts_lexer__eof(_self)) break; + if (!ts_lexer__eof(_self)) { + ts_lexer__get_lookahead(self); + + // Advance to the recorded position + while (self->current_position.bytes < goal_byte && !ts_lexer__eof(_self) && self->chunk) { + ts_lexer__do_advance(self, false); + if (ts_lexer__eof(_self)) 
break; + } } } - return result; + return self->column_data.value; } // Is the lexer at a boundary between two disjoint included ranges of @@ -284,6 +335,17 @@ static bool ts_lexer__is_at_included_range_start(const TSLexer *_self) { } } +static void ts_lexer__log(const TSLexer *_self, const char *fmt, ...) { + Lexer *self = (Lexer *)_self; + va_list args; + va_start(args, fmt); + if (self->logger.log) { + vsnprintf(self->debug_buffer, TREE_SITTER_SERIALIZATION_BUFFER_SIZE, fmt, args); + self->logger.log(self->logger.payload, TSLogTypeLex, self->debug_buffer); + } + va_end(args); +} + void ts_lexer_init(Lexer *self) { *self = (Lexer) { .data = { @@ -295,6 +357,7 @@ void ts_lexer_init(Lexer *self) { .get_column = ts_lexer__get_column, .is_at_included_range_start = ts_lexer__is_at_included_range_start, .eof = ts_lexer__eof, + .log = ts_lexer__log, .lookahead = 0, .result_symbol = 0, }, @@ -309,6 +372,11 @@ void ts_lexer_init(Lexer *self) { .included_ranges = NULL, .included_range_count = 0, .current_included_range_index = 0, + .did_get_column = false, + .column_data = { + .valid = false, + .value = 0 + } }; ts_lexer_set_included_ranges(self, NULL, 0); } @@ -339,10 +407,12 @@ void ts_lexer_start(Lexer *self) { if (!ts_lexer__eof(&self->data)) { if (!self->chunk_size) ts_lexer__get_chunk(self); if (!self->lookahead_size) ts_lexer__get_lookahead(self); - if ( - self->current_position.bytes == 0 && - self->data.lookahead == BYTE_ORDER_MARK - ) ts_lexer__advance(&self->data, true); + if (self->current_position.bytes == 0) { + if (self->data.lookahead == BYTE_ORDER_MARK) { + ts_lexer__advance(&self->data, true); + } + ts_lexer__set_column_data(self, 0); + } } } @@ -365,7 +435,7 @@ void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) { // Therefore, the next byte *after* the current (invalid) character // affects the interpretation of the current character. 
if (self->data.lookahead == TS_DECODE_ERROR) { - current_lookahead_end_byte++; + current_lookahead_end_byte += 4; // the maximum number of bytes read to identify an invalid code point } if (current_lookahead_end_byte > *lookahead_end_byte) { @@ -373,12 +443,6 @@ void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) { } } -void ts_lexer_advance_to_end(Lexer *self) { - while (self->chunk) { - ts_lexer__advance(&self->data, false); - } -} - void ts_lexer_mark_end(Lexer *self) { ts_lexer__mark_end(&self->data); } diff --git a/lexer.h b/lexer.h index a8cc38f1..d61dee38 100644 --- a/lexer.h +++ b/lexer.h @@ -10,6 +10,11 @@ extern "C" { #include "api.h" #include "./parser.h" +typedef struct { + uint32_t value; + bool valid; +} ColumnData; + typedef struct { TSLexer data; Length current_position; @@ -27,18 +32,18 @@ typedef struct { uint32_t chunk_size; uint32_t lookahead_size; bool did_get_column; + ColumnData column_data; char debug_buffer[TREE_SITTER_SERIALIZATION_BUFFER_SIZE]; } Lexer; -void ts_lexer_init(Lexer *); -void ts_lexer_delete(Lexer *); -void ts_lexer_set_input(Lexer *, TSInput); -void ts_lexer_reset(Lexer *, Length); -void ts_lexer_start(Lexer *); -void ts_lexer_finish(Lexer *, uint32_t *); -void ts_lexer_advance_to_end(Lexer *); -void ts_lexer_mark_end(Lexer *); +void ts_lexer_init(Lexer *self); +void ts_lexer_delete(Lexer *self); +void ts_lexer_set_input(Lexer *self, TSInput input); +void ts_lexer_reset(Lexer *self, Length position); +void ts_lexer_start(Lexer *self); +void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte); +void ts_lexer_mark_end(Lexer *self); bool ts_lexer_set_included_ranges(Lexer *self, const TSRange *ranges, uint32_t count); TSRange *ts_lexer_included_ranges(const Lexer *self, uint32_t *count); diff --git a/node.c b/node.c index f9960213..d83fa90b 100644 --- a/node.c +++ b/node.c @@ -1,4 +1,5 @@ #include +#include "./point.h" #include "./subtree.h" #include "./tree.h" #include "./language.h" @@ -12,6 +13,8 @@ 
typedef struct { const TSSymbol *alias_sequence; } NodeChildIterator; +static inline bool ts_node__is_relevant(TSNode self, bool include_anonymous); + // TSNode - constructors TSNode ts_node_new( @@ -260,8 +263,16 @@ static inline TSNode ts_node__next_sibling(TSNode self, bool include_anonymous) TSNode child; NodeChildIterator iterator = ts_node_iterate_children(&node); while (ts_node_child_iterator_next(&iterator, &child)) { - if (iterator.position.bytes < target_end_byte) continue; - if (ts_node_start_byte(child) <= ts_node_start_byte(self)) { + if (iterator.position.bytes <= target_end_byte) continue; + uint32_t start_byte = ts_node_start_byte(self); + uint32_t child_start_byte = ts_node_start_byte(child); + + bool is_empty = start_byte == target_end_byte; + bool contains_target = is_empty ? + child_start_byte < start_byte : + child_start_byte <= start_byte; + + if (contains_target) { if (ts_node__subtree(child).ptr != ts_node__subtree(self).ptr) { child_containing_target = child; } @@ -304,22 +315,36 @@ static inline TSNode ts_node__first_child_for_byte( TSNode node = self; bool did_descend = true; + NodeChildIterator last_iterator; + bool has_last_iterator = false; + while (did_descend) { did_descend = false; TSNode child; NodeChildIterator iterator = ts_node_iterate_children(&node); + loop: while (ts_node_child_iterator_next(&iterator, &child)) { if (ts_node_end_byte(child) > goal) { if (ts_node__is_relevant(child, include_anonymous)) { return child; } else if (ts_node_child_count(child) > 0) { + if (iterator.child_index < ts_subtree_child_count(ts_node__subtree(child))) { + last_iterator = iterator; + has_last_iterator = true; + } did_descend = true; node = child; break; } } } + + if (!did_descend && has_last_iterator) { + iterator = last_iterator; + has_last_iterator = false; + goto loop; + } } return ts_node__null(); @@ -331,6 +356,9 @@ static inline TSNode ts_node__descendant_for_byte_range( uint32_t range_end, bool include_anonymous ) { + if (range_start 
> range_end) { + return ts_node__null(); + } TSNode node = self; TSNode last_visible_node = self; @@ -344,9 +372,13 @@ static inline TSNode ts_node__descendant_for_byte_range( uint32_t node_end = iterator.position.bytes; // The end of this node must extend far enough forward to touch - // the end of the range and exceed the start of the range. + // the end of the range if (node_end < range_end) continue; - if (node_end <= range_start) continue; + + // ...and exceed the start of the range, unless the node itself is + // empty, in which case it must at least be equal to the start of the range. + bool is_empty = ts_node_start_byte(child) == node_end; + if (is_empty ? node_end < range_start : node_end <= range_start) continue; // The start of this node must extend far enough backward to // touch the start of the range. @@ -370,6 +402,9 @@ static inline TSNode ts_node__descendant_for_point_range( TSPoint range_end, bool include_anonymous ) { + if (point_gt(range_start, range_end)) { + return ts_node__null(); + } TSNode node = self; TSNode last_visible_node = self; @@ -383,9 +418,15 @@ static inline TSNode ts_node__descendant_for_point_range( TSPoint node_end = iterator.position.extent; // The end of this node must extend far enough forward to touch - // the end of the range and exceed the start of the range. + // the end of the range if (point_lt(node_end, range_end)) continue; - if (point_lte(node_end, range_start)) continue; + + // ...and exceed the start of the range, unless the node itself is + // empty, in which case it must at least be equal to the start of the range. + bool is_empty = point_eq(ts_node_start_point(child), node_end); + if (is_empty ? point_lt(node_end, range_start) : point_lte(node_end, range_start)) { + continue; + } // The start of this node must extend far enough backward to // touch the start of the range. 
@@ -505,33 +546,48 @@ TSStateId ts_node_next_parse_state(TSNode self) { TSNode ts_node_parent(TSNode self) { TSNode node = ts_tree_root_node(self.tree); - uint32_t end_byte = ts_node_end_byte(self); if (node.id == self.id) return ts_node__null(); - TSNode last_visible_node = node; - bool did_descend = true; - while (did_descend) { - did_descend = false; + while (true) { + TSNode next_node = ts_node_child_with_descendant(node, self); + if (next_node.id == self.id || ts_node_is_null(next_node)) break; + node = next_node; + } - TSNode child; - NodeChildIterator iterator = ts_node_iterate_children(&node); - while (ts_node_child_iterator_next(&iterator, &child)) { + return node; +} + +TSNode ts_node_child_with_descendant(TSNode self, TSNode descendant) { + uint32_t start_byte = ts_node_start_byte(descendant); + uint32_t end_byte = ts_node_end_byte(descendant); + bool is_empty = start_byte == end_byte; + + do { + NodeChildIterator iter = ts_node_iterate_children(&self); + do { if ( - ts_node_start_byte(child) > ts_node_start_byte(self) || - child.id == self.id - ) break; - if (iterator.position.bytes >= end_byte && ts_node_child_count(child) > 0) { - node = child; - if (ts_node__is_relevant(child, true)) { - last_visible_node = node; + !ts_node_child_iterator_next(&iter, &self) + || ts_node_start_byte(self) > start_byte + ) { + return ts_node__null(); + } + if (self.id == descendant.id) { + return self; + } + + // If the descendant is empty, and the end byte is within `self`, + // we check whether `self` contains it or not. + if (is_empty && iter.position.bytes >= end_byte && ts_node_child_count(self) > 0) { + TSNode child = ts_node_child_with_descendant(self, descendant); + // If the child is not null, return self if it's relevant, else return the child + if (!ts_node_is_null(child)) { + return ts_node__is_relevant(self, true) ? self : child; } - did_descend = true; - break; } - } - } + } while ((is_empty ? 
iter.position.bytes <= end_byte : iter.position.bytes < end_byte) || ts_node_child_count(self) == 0); + } while (!ts_node__is_relevant(self, true)); - return last_visible_node; + return self; } TSNode ts_node_child(TSNode self, uint32_t child_index) { @@ -644,6 +700,9 @@ const char *ts_node_field_name_for_child(TSNode self, uint32_t child_index) { while (ts_node_child_iterator_next(&iterator, &child)) { if (ts_node__is_relevant(child, true)) { if (index == child_index) { + if (ts_node_is_extra(child)) { + return NULL; + } const char *field_name = ts_node__field_name_from_language(result, iterator.structural_child_index - 1); if (field_name) return field_name; return inherited_field_name; @@ -669,6 +728,48 @@ const char *ts_node_field_name_for_child(TSNode self, uint32_t child_index) { return NULL; } +const char *ts_node_field_name_for_named_child(TSNode self, uint32_t named_child_index) { + TSNode result = self; + bool did_descend = true; + const char *inherited_field_name = NULL; + + while (did_descend) { + did_descend = false; + + TSNode child; + uint32_t index = 0; + NodeChildIterator iterator = ts_node_iterate_children(&result); + while (ts_node_child_iterator_next(&iterator, &child)) { + if (ts_node__is_relevant(child, false)) { + if (index == named_child_index) { + if (ts_node_is_extra(child)) { + return NULL; + } + const char *field_name = ts_node__field_name_from_language(result, iterator.structural_child_index - 1); + if (field_name) return field_name; + return inherited_field_name; + } + index++; + } else { + uint32_t named_grandchild_index = named_child_index - index; + uint32_t grandchild_count = ts_node__relevant_child_count(child, false); + if (named_grandchild_index < grandchild_count) { + const char *field_name = ts_node__field_name_from_language(result, iterator.structural_child_index - 1); + if (field_name) inherited_field_name = field_name; + + did_descend = true; + result = child; + named_child_index = named_grandchild_index; + break; + } + 
index += grandchild_count; + } + } + } + + return NULL; +} + TSNode ts_node_child_by_field_name( TSNode self, const char *name, diff --git a/parser.c b/parser.c index a72983dd..3d3891a8 100644 --- a/parser.c +++ b/parser.c @@ -1,7 +1,4 @@ -#define _POSIX_C_SOURCE 200112L - #include -#include #include #include #include @@ -21,6 +18,7 @@ #include "./stack.h" #include "./subtree.h" #include "./tree.h" +#include "./ts_assert.h" #include "./wasm_store.h" #define LOG(...) \ @@ -33,7 +31,11 @@ if (self->lexer.logger.log || self->dot_graph_file) { \ char *buf = self->lexer.debug_buffer; \ const char *symbol = symbol_name; \ - int off = sprintf(buf, "lexed_lookahead sym:"); \ + int off = snprintf( \ + buf, \ + TREE_SITTER_SERIALIZATION_BUFFER_SIZE, \ + "lexed_lookahead sym:" \ + ); \ for ( \ int i = 0; \ symbol[i] != '\0' \ @@ -78,8 +80,8 @@ static const unsigned MAX_VERSION_COUNT = 6; static const unsigned MAX_VERSION_COUNT_OVERFLOW = 4; static const unsigned MAX_SUMMARY_DEPTH = 16; -static const unsigned MAX_COST_DIFFERENCE = 16 * ERROR_COST_PER_SKIPPED_TREE; -static const unsigned OP_COUNT_PER_TIMEOUT_CHECK = 100; +static const unsigned MAX_COST_DIFFERENCE = 18 * ERROR_COST_PER_SKIPPED_TREE; +static const unsigned OP_COUNT_PER_PARSER_TIMEOUT_CHECK = 100; typedef struct { Subtree token; @@ -109,8 +111,12 @@ struct TSParser { const volatile size_t *cancellation_flag; Subtree old_tree; TSRangeArray included_range_differences; + TSParseOptions parse_options; + TSParseState parse_state; unsigned included_range_difference_index; bool has_scanner_error; + bool canceled_balancing; + bool has_error; }; typedef struct { @@ -187,7 +193,7 @@ static bool ts_parser__breakdown_top_of_stack( did_break_down = true; pending = false; for (uint32_t i = 0; i < pop.size; i++) { - StackSlice slice = pop.contents[i]; + StackSlice slice = *array_get(&pop, i); TSStateId state = ts_stack_state(self->stack, slice.version); Subtree parent = *array_front(&slice.subtrees); @@ -206,7 +212,7 @@ static 
bool ts_parser__breakdown_top_of_stack( } for (uint32_t j = 1; j < slice.subtrees.size; j++) { - Subtree tree = slice.subtrees.contents[j]; + Subtree tree = *array_get(&slice.subtrees, j); ts_stack_push(self->stack, slice.version, tree, false, state); } @@ -338,7 +344,7 @@ static bool ts_parser__better_version_exists( return false; } -static bool ts_parser__call_main_lex_fn(TSParser *self, TSLexMode lex_mode) { +static bool ts_parser__call_main_lex_fn(TSParser *self, TSLexerMode lex_mode) { if (ts_language_is_wasm(self->language)) { return ts_wasm_store_call_lex_main(self->wasm_store, lex_mode.lex_state); } else { @@ -346,7 +352,7 @@ static bool ts_parser__call_main_lex_fn(TSParser *self, TSLexMode lex_mode) { } } -static bool ts_parser__call_keyword_lex_fn(TSParser *self, TSLexMode lex_mode) { +static bool ts_parser__call_keyword_lex_fn(TSParser *self) { if (ts_language_is_wasm(self->language)) { return ts_wasm_store_call_lex_keyword(self->wasm_store, 0); } else { @@ -397,10 +403,12 @@ static unsigned ts_parser__external_scanner_serialize( self->lexer.debug_buffer ); } else { - return self->language->external_scanner.serialize( + uint32_t length = self->language->external_scanner.serialize( self->external_scanner_payload, self->lexer.debug_buffer ); + ts_assert(length <= TREE_SITTER_SERIALIZATION_BUFFER_SIZE); + return length; } } @@ -467,10 +475,10 @@ static bool ts_parser__can_reuse_first_leaf( Subtree tree, TableEntry *table_entry ) { - TSLexMode current_lex_mode = self->language->lex_modes[state]; TSSymbol leaf_symbol = ts_subtree_leaf_symbol(tree); TSStateId leaf_state = ts_subtree_leaf_parse_state(tree); - TSLexMode leaf_lex_mode = self->language->lex_modes[leaf_state]; + TSLexerMode current_lex_mode = ts_language_lex_mode_for_state(self->language, state); + TSLexerMode leaf_lex_mode = ts_language_lex_mode_for_state(self->language, leaf_state); // At the end of a non-terminal extra node, the lexer normally returns // NULL, which indicates that the parser 
should look for a reduce action @@ -481,7 +489,7 @@ static bool ts_parser__can_reuse_first_leaf( // If the token was created in a state with the same set of lookaheads, it is reusable. if ( table_entry->action_count > 0 && - memcmp(&leaf_lex_mode, &current_lex_mode, sizeof(TSLexMode)) == 0 && + memcmp(&leaf_lex_mode, &current_lex_mode, sizeof(TSLexerMode)) == 0 && ( leaf_symbol != self->language->keyword_capture_token || (!ts_subtree_is_keyword(tree) && ts_subtree_parse_state(tree) == state) @@ -501,7 +509,7 @@ static Subtree ts_parser__lex( StackVersion version, TSStateId parse_state ) { - TSLexMode lex_mode = self->language->lex_modes[parse_state]; + TSLexerMode lex_mode = ts_language_lex_mode_for_state(self->language, parse_state); if (lex_mode.lex_state == (uint16_t)-1) { LOG("no_lookahead_after_non_terminal_extra"); return NULL_SUBTREE; @@ -525,6 +533,7 @@ static Subtree ts_parser__lex( for (;;) { bool found_token = false; Length current_position = self->lexer.current_position; + ColumnData column_data = self->lexer.column_data; if (lex_mode.external_lex_state != 0) { LOG( @@ -547,27 +556,29 @@ static Subtree ts_parser__lex( external_scanner_state_len ); - // When recovering from an error, ignore any zero-length external tokens - // unless they have changed the external scanner's state. This helps to - // avoid infinite loops which could otherwise occur, because the lexer is - // looking for any possible token, instead of looking for the specific set of - // tokens that are valid in some parse state. + // Avoid infinite loops caused by the external scanner returning empty tokens. + // Empty tokens are needed in some circumstances, e.g. indent/dedent tokens + // in Python. 
Ignore the following classes of empty tokens: // - // Note that it's possible that the token end position may be *before* the - // original position of the lexer because of the way that tokens are positioned - // at included range boundaries: when a token is terminated at the start of - // an included range, it is marked as ending at the *end* of the preceding - // included range. + // * Tokens produced during error recovery. When recovering from an error, + // all tokens are allowed, so it's easy to accidentally return unwanted + // empty tokens. + // * Tokens that are marked as 'extra' in the grammar. These don't change + // the parse state, so they would definitely cause an infinite loop. if ( self->lexer.token_end_position.bytes <= current_position.bytes && - (error_mode || !ts_stack_has_advanced_since_error(self->stack, version)) && !external_scanner_state_changed ) { - LOG( - "ignore_empty_external_token symbol:%s", - SYM_NAME(self->language->external_scanner.symbol_map[self->lexer.data.result_symbol]) - ) - found_token = false; + TSSymbol symbol = self->language->external_scanner.symbol_map[self->lexer.data.result_symbol]; + TSStateId next_parse_state = ts_language_next_state(self->language, parse_state, symbol); + bool token_is_extra = (next_parse_state == parse_state); + if (error_mode || !ts_stack_has_advanced_since_error(self->stack, version) || token_is_extra) { + LOG( + "ignore_empty_external_token symbol:%s", + SYM_NAME(self->language->external_scanner.symbol_map[self->lexer.data.result_symbol]) + ); + found_token = false; + } } } @@ -578,6 +589,7 @@ static Subtree ts_parser__lex( } ts_lexer_reset(&self->lexer, current_position); + self->lexer.column_data = column_data; } LOG( @@ -593,7 +605,7 @@ static Subtree ts_parser__lex( if (!error_mode) { error_mode = true; - lex_mode = self->language->lex_modes[ERROR_STATE]; + lex_mode = ts_language_lex_mode_for_state(self->language, ERROR_STATE); ts_lexer_reset(&self->lexer, start_position); continue; } @@ 
-645,12 +657,15 @@ static Subtree ts_parser__lex( ts_lexer_reset(&self->lexer, self->lexer.token_start_position); ts_lexer_start(&self->lexer); - is_keyword = ts_parser__call_keyword_lex_fn(self, lex_mode); + is_keyword = ts_parser__call_keyword_lex_fn(self); if ( is_keyword && self->lexer.token_end_position.bytes == end_byte && - ts_language_has_actions(self->language, parse_state, self->lexer.data.result_symbol) + ( + ts_language_has_actions(self->language, parse_state, self->lexer.data.result_symbol) || + ts_language_is_reserved_word(self->language, parse_state, self->lexer.data.result_symbol) + ) ) { symbol = self->lexer.data.result_symbol; } @@ -934,20 +949,22 @@ static StackVersion ts_parser__reduce( // children. StackSliceArray pop = ts_stack_pop_count(self->stack, version, count); uint32_t removed_version_count = 0; + uint32_t halted_version_count = ts_stack_halted_version_count(self->stack); for (uint32_t i = 0; i < pop.size; i++) { - StackSlice slice = pop.contents[i]; + StackSlice slice = *array_get(&pop, i); StackVersion slice_version = slice.version - removed_version_count; // This is where new versions are added to the parse stack. The versions // will all be sorted and truncated at the end of the outer parsing loop. // Allow the maximum version count to be temporarily exceeded, but only // by a limited threshold. 
- if (slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW) { + if (slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW + halted_version_count) { ts_stack_remove_version(self->stack, slice_version); ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); removed_version_count++; while (i + 1 < pop.size) { - StackSlice next_slice = pop.contents[i + 1]; + LOG("aborting reduce with too many versions") + StackSlice next_slice = *array_get(&pop, i + 1); if (next_slice.version != slice.version) break; ts_subtree_array_delete(&self->tree_pool, &next_slice.subtrees); i++; @@ -970,7 +987,7 @@ static StackVersion ts_parser__reduce( // choose one of the arrays of trees to be the parent node's children, and // delete the rest of the tree arrays. while (i + 1 < pop.size) { - StackSlice next_slice = pop.contents[i + 1]; + StackSlice next_slice = *array_get(&pop, i + 1); if (next_slice.version != slice.version) break; i++; @@ -1012,7 +1029,7 @@ static StackVersion ts_parser__reduce( // were previously on top of the stack. 
ts_stack_push(self->stack, slice_version, ts_subtree_from_mut(parent), false, next_state); for (uint32_t j = 0; j < self->trailing_extras.size; j++) { - ts_stack_push(self->stack, slice_version, self->trailing_extras.contents[j], false, next_state); + ts_stack_push(self->stack, slice_version, *array_get(&self->trailing_extras, j), false, next_state); } for (StackVersion j = 0; j < slice_version; j++) { @@ -1035,18 +1052,18 @@ static void ts_parser__accept( StackVersion version, Subtree lookahead ) { - assert(ts_subtree_is_eof(lookahead)); + ts_assert(ts_subtree_is_eof(lookahead)); ts_stack_push(self->stack, version, lookahead, false, 1); StackSliceArray pop = ts_stack_pop_all(self->stack, version); for (uint32_t i = 0; i < pop.size; i++) { - SubtreeArray trees = pop.contents[i].subtrees; + SubtreeArray trees = array_get(&pop, i)->subtrees; Subtree root = NULL_SUBTREE; for (uint32_t j = trees.size - 1; j + 1 > 0; j--) { - Subtree tree = trees.contents[j]; + Subtree tree = *array_get(&trees, j); if (!ts_subtree_extra(tree)) { - assert(!tree.data.is_inline); + ts_assert(!tree.data.is_inline); uint32_t child_count = ts_subtree_child_count(tree); const Subtree *children = ts_subtree_children(tree); for (uint32_t k = 0; k < child_count; k++) { @@ -1064,7 +1081,7 @@ static void ts_parser__accept( } } - assert(root.ptr); + ts_assert(root.ptr); self->accept_count++; if (self->finished_tree.ptr) { @@ -1079,7 +1096,7 @@ static void ts_parser__accept( } } - ts_stack_remove_version(self->stack, pop.contents[0].version); + ts_stack_remove_version(self->stack, array_get(&pop, 0)->version); ts_stack_halt(self->stack, version); } @@ -1145,7 +1162,7 @@ static bool ts_parser__do_all_potential_reductions( StackVersion reduction_version = STACK_VERSION_NONE; for (uint32_t j = 0; j < self->reduce_actions.size; j++) { - ReduceAction action = self->reduce_actions.contents[j]; + ReduceAction action = *array_get(&self->reduce_actions, j); reduction_version = ts_parser__reduce( self, 
version, action.symbol, action.count, @@ -1183,7 +1200,7 @@ static bool ts_parser__recover_to_state( StackVersion previous_version = STACK_VERSION_NONE; for (unsigned i = 0; i < pop.size; i++) { - StackSlice slice = pop.contents[i]; + StackSlice slice = *array_get(&pop, i); if (slice.version == previous_version) { ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); @@ -1200,13 +1217,13 @@ static bool ts_parser__recover_to_state( SubtreeArray error_trees = ts_stack_pop_error(self->stack, slice.version); if (error_trees.size > 0) { - assert(error_trees.size == 1); - Subtree error_tree = error_trees.contents[0]; + ts_assert(error_trees.size == 1); + Subtree error_tree = *array_get(&error_trees, 0); uint32_t error_child_count = ts_subtree_child_count(error_tree); if (error_child_count > 0) { array_splice(&slice.subtrees, 0, 0, error_child_count, ts_subtree_children(error_tree)); for (unsigned j = 0; j < error_child_count; j++) { - ts_subtree_retain(slice.subtrees.contents[j]); + ts_subtree_retain(*array_get(&slice.subtrees, j)); } } ts_subtree_array_delete(&self->tree_pool, &error_trees); @@ -1222,7 +1239,7 @@ static bool ts_parser__recover_to_state( } for (unsigned j = 0; j < self->trailing_extras.size; j++) { - Subtree tree = self->trailing_extras.contents[j]; + Subtree tree = *array_get(&self->trailing_extras, j); ts_stack_push(self->stack, slice.version, tree, false, goal_state); } @@ -1258,7 +1275,7 @@ static void ts_parser__recover( // if the current lookahead token would be valid in that state. if (summary && !ts_subtree_is_error(lookahead)) { for (unsigned i = 0; i < summary->size; i++) { - StackSummaryEntry entry = summary->contents[i]; + StackSummaryEntry entry = *array_get(summary, i); if (entry.state == ERROR_STATE) continue; if (entry.position.bytes == position.bytes) continue; @@ -1303,10 +1320,23 @@ static void ts_parser__recover( // and subsequently halted. Remove those versions. 
for (unsigned i = previous_version_count; i < ts_stack_version_count(self->stack); i++) { if (!ts_stack_is_active(self->stack, i)) { + LOG("removed paused version:%u", i); ts_stack_remove_version(self->stack, i--); + LOG_STACK(); } } + // If the parser is still in the error state at the end of the file, just wrap everything + // in an ERROR node and terminate. + if (ts_subtree_is_eof(lookahead)) { + LOG("recover_eof"); + SubtreeArray children = array_new(); + Subtree parent = ts_subtree_new_error_node(&children, false, self->language); + ts_stack_push(self->stack, version, parent, false, 1); + ts_parser__accept(self, version, lookahead); + return; + } + // If strategy 1 succeeded, a new stack version will have been created which is able to handle // the current lookahead token. Now, in addition, try strategy 2 described above: skip the // current lookahead token by wrapping it in an ERROR node. @@ -1327,17 +1357,6 @@ static void ts_parser__recover( return; } - // If the parser is still in the error state at the end of the file, just wrap everything - // in an ERROR node and terminate. - if (ts_subtree_is_eof(lookahead)) { - LOG("recover_eof"); - SubtreeArray children = array_new(); - Subtree parent = ts_subtree_new_error_node(&children, false, self->language); - ts_stack_push(self->stack, version, parent, false, 1); - ts_parser__accept(self, version, lookahead); - return; - } - // Do not recover if the result would clearly be worse than some existing stack version. unsigned new_cost = current_error_cost + ERROR_COST_PER_SKIPPED_TREE + @@ -1383,18 +1402,18 @@ static void ts_parser__recover( // arbitrarily and discard the rest. 
if (pop.size > 1) { for (unsigned i = 1; i < pop.size; i++) { - ts_subtree_array_delete(&self->tree_pool, &pop.contents[i].subtrees); + ts_subtree_array_delete(&self->tree_pool, &array_get(&pop, i)->subtrees); } - while (ts_stack_version_count(self->stack) > pop.contents[0].version + 1) { - ts_stack_remove_version(self->stack, pop.contents[0].version + 1); + while (ts_stack_version_count(self->stack) > array_get(&pop, 0)->version + 1) { + ts_stack_remove_version(self->stack, array_get(&pop, 0)->version + 1); } } - ts_stack_renumber_version(self->stack, pop.contents[0].version, version); - array_push(&pop.contents[0].subtrees, ts_subtree_from_mut(error_repeat)); + ts_stack_renumber_version(self->stack, array_get(&pop, 0)->version, version); + array_push(&array_get(&pop, 0)->subtrees, ts_subtree_from_mut(error_repeat)); error_repeat = ts_subtree_new_node( ts_builtin_sym_error_repeat, - &pop.contents[0].subtrees, + &array_get(&pop, 0)->subtrees, 0, self->language ); @@ -1407,6 +1426,16 @@ static void ts_parser__recover( self->stack, version, ts_subtree_last_external_token(lookahead) ); } + + bool has_error = true; + for (unsigned i = 0; i < ts_stack_version_count(self->stack); i++) { + ErrorStatus status = ts_parser__version_status(self, i); + if (!status.is_in_error) { + has_error = false; + break; + } + } + self->has_error = has_error; } static void ts_parser__handle_error( @@ -1488,8 +1517,7 @@ static void ts_parser__handle_error( for (unsigned i = previous_version_count; i < version_count; i++) { bool did_merge = ts_stack_merge(self->stack, version, previous_version_count); - assert(did_merge); - (void)did_merge; // fix warning/error with clang -Os + ts_assert(did_merge); } ts_stack_record_summary(self->stack, version, MAX_SUMMARY_DEPTH); @@ -1507,6 +1535,32 @@ static void ts_parser__handle_error( LOG_STACK(); } +static bool ts_parser__check_progress(TSParser *self, Subtree *lookahead, const uint32_t *position, unsigned operations) { + self->operation_count += 
operations; + if (self->operation_count >= OP_COUNT_PER_PARSER_TIMEOUT_CHECK) { + self->operation_count = 0; + } + if (position != NULL) { + self->parse_state.current_byte_offset = *position; + self->parse_state.has_error = self->has_error; + } + if ( + self->operation_count == 0 && + ( + // TODO(amaanq): remove cancellation flag & clock checks before 0.26 + (self->cancellation_flag && atomic_load(self->cancellation_flag)) || + (!clock_is_null(self->end_clock) && clock_is_gt(clock_now(), self->end_clock)) || + (self->parse_options.progress_callback && self->parse_options.progress_callback(&self->parse_state)) + ) + ) { + if (lookahead && lookahead->ptr) { + ts_subtree_release(&self->tree_pool, *lookahead); + } + return false; + } + return true; +} + static bool ts_parser__advance( TSParser *self, StackVersion version, @@ -1557,19 +1611,9 @@ static bool ts_parser__advance( } } - // If a cancellation flag or a timeout was provided, then check every + // If a cancellation flag, timeout, or progress callback was provided, then check every // time a fixed number of parse actions has been processed. - if (++self->operation_count == OP_COUNT_PER_TIMEOUT_CHECK) { - self->operation_count = 0; - } - if ( - self->operation_count == 0 && - ((self->cancellation_flag && atomic_load(self->cancellation_flag)) || - (!clock_is_null(self->end_clock) && clock_is_gt(clock_now(), self->end_clock))) - ) { - if (lookahead.ptr) { - ts_subtree_release(&self->tree_pool, lookahead); - } + if (!ts_parser__check_progress(self, &lookahead, &position, 1)) { return false; } @@ -1578,6 +1622,7 @@ static bool ts_parser__advance( // an ambiguous state. REDUCE actions always create a new stack // version, whereas SHIFT actions update the existing stack version // and terminate this loop. 
+ bool did_reduce = false; StackVersion last_reduction_version = STACK_VERSION_NONE; for (uint32_t i = 0; i < table_entry.action_count; i++) { TSParseAction action = table_entry.actions[i]; @@ -1613,6 +1658,7 @@ static bool ts_parser__advance( action.reduce.dynamic_precedence, action.reduce.production_id, is_fragile, end_of_non_terminal_extra ); + did_reduce = true; if (reduction_version != STACK_VERSION_NONE) { last_reduction_version = reduction_version; } @@ -1664,22 +1710,30 @@ static bool ts_parser__advance( continue; } - // A non-terminal extra rule was reduced and merged into an existing - // stack version. This version can be discarded. - if (!lookahead.ptr) { + // A reduction was performed, but was merged into an existing stack version. + // This version can be discarded. + if (did_reduce) { + if (lookahead.ptr) { + ts_subtree_release(&self->tree_pool, lookahead); + } ts_stack_halt(self->stack, version); return true; } - // If there were no parse actions for the current lookahead token, then - // it is not valid in this state. If the current lookahead token is a - // keyword, then switch to treating it as the normal word token if that - // token is valid in this state. + // If the current lookahead token is a keyword that is not valid, but the + // default word token *is* valid, then treat the lookahead token as the word + // token instead. 
if ( ts_subtree_is_keyword(lookahead) && - ts_subtree_symbol(lookahead) != self->language->keyword_capture_token + ts_subtree_symbol(lookahead) != self->language->keyword_capture_token && + !ts_language_is_reserved_word(self->language, state, ts_subtree_symbol(lookahead)) ) { - ts_language_table_entry(self->language, state, self->language->keyword_capture_token, &table_entry); + ts_language_table_entry( + self->language, + state, + self->language->keyword_capture_token, + &table_entry + ); if (table_entry.action_count > 0) { LOG( "switch from_keyword:%s, to_word_token:%s", @@ -1694,19 +1748,10 @@ static bool ts_parser__advance( } } - // If the current lookahead token is not valid and the parser is - // already in the error state, restart the error recovery process. - // TODO - can this be unified with the other `RECOVER` case above? - if (state == ERROR_STATE) { - ts_parser__recover(self, version, lookahead); - return true; - } - - // If the current lookahead token is not valid and the previous - // subtree on the stack was reused from an old tree, it isn't actually - // valid to reuse it. Remove it from the stack, and in its place, - // push each of its children. Then try again to process the current - // lookahead. + // If the current lookahead token is not valid and the previous subtree on + // the stack was reused from an old tree, then it wasn't actually valid to + // reuse that previous subtree. Remove it from the stack, and in its place, + // push each of its children. Then try again to process the current lookahead. if (ts_parser__breakdown_top_of_stack(self, version)) { state = ts_stack_state(self->stack, version); ts_subtree_release(&self->tree_pool, lookahead); @@ -1714,12 +1759,12 @@ static bool ts_parser__advance( continue; } - // At this point, the current lookahead token is definitely not valid - // for this parse stack version. Mark this version as paused and continue - // processing any other stack versions that might exist. 
If some other - // version advances successfully, then this version can simply be removed. - // But if all versions end up paused, then error recovery is needed. - LOG("detect_error"); + // Otherwise, there is definitely an error in this version of the parse stack. + // Mark this version as paused and continue processing any other stack + // versions that exist. If some other version advances successfully, then + // this version can simply be removed. But if all versions end up paused, + // then error recovery is needed. + LOG("detect_error lookahead:%s", TREE_NAME(lookahead)); ts_stack_pause(self->stack, version, lookahead); return true; } @@ -1808,6 +1853,7 @@ static unsigned ts_parser__condense_stack(TSParser *self) { has_unpaused_version = true; } else { ts_stack_remove_version(self->stack, i); + made_changes = true; i--; n--; } @@ -1825,8 +1871,66 @@ static unsigned ts_parser__condense_stack(TSParser *self) { return min_error_cost; } +static bool ts_parser__balance_subtree(TSParser *self) { + Subtree finished_tree = self->finished_tree; + + // If we haven't canceled balancing in progress before, then we want to clear the tree stack and + // push the initial finished tree onto it. Otherwise, if we're resuming balancing after a + // cancellation, we don't want to clear the tree stack. 
+ if (!self->canceled_balancing) { + array_clear(&self->tree_pool.tree_stack); + if (ts_subtree_child_count(finished_tree) > 0 && finished_tree.ptr->ref_count == 1) { + array_push(&self->tree_pool.tree_stack, ts_subtree_to_mut_unsafe(finished_tree)); + } + } + + while (self->tree_pool.tree_stack.size > 0) { + if (!ts_parser__check_progress(self, NULL, NULL, 1)) { + return false; + } + + MutableSubtree tree = *array_get(&self->tree_pool.tree_stack, + self->tree_pool.tree_stack.size - 1 + ); + + if (tree.ptr->repeat_depth > 0) { + Subtree child1 = ts_subtree_children(tree)[0]; + Subtree child2 = ts_subtree_children(tree)[tree.ptr->child_count - 1]; + long repeat_delta = (long)ts_subtree_repeat_depth(child1) - (long)ts_subtree_repeat_depth(child2); + if (repeat_delta > 0) { + unsigned n = (unsigned)repeat_delta; + + for (unsigned i = n / 2; i > 0; i /= 2) { + ts_subtree_compress(tree, i, self->language, &self->tree_pool.tree_stack); + n -= i; + + // We scale the operation count increment in `ts_parser__check_progress` proportionately to the compression + // size since larger values of i take longer to process. Shifting by 4 empirically provides good check + // intervals (e.g. 193 operations when i=3100) to prevent blocking during large compressions. + uint8_t operations = i >> 4 > 0 ? 
i >> 4 : 1; + if (!ts_parser__check_progress(self, NULL, NULL, operations)) { + return false; + } + } + } + } + + (void)array_pop(&self->tree_pool.tree_stack); + + for (uint32_t i = 0; i < tree.ptr->child_count; i++) { + Subtree child = ts_subtree_children(tree)[i]; + if (ts_subtree_child_count(child) > 0 && child.ptr->ref_count == 1) { + array_push(&self->tree_pool.tree_stack, ts_subtree_to_mut_unsafe(child)); + } + } + } + + return true; +} + static bool ts_parser_has_outstanding_parse(TSParser *self) { return ( + self->canceled_balancing || self->external_scanner_payload || ts_stack_state(self->stack, 0) != 1 || ts_stack_node_count_since_error(self->stack, 0) != 0 @@ -1849,6 +1953,8 @@ TSParser *ts_parser_new(void) { self->timeout_duration = 0; self->language = NULL; self->has_scanner_error = false; + self->has_error = false; + self->canceled_balancing = false; self->external_scanner_payload = NULL; self->end_clock = clock_null(); self->operation_count = 0; @@ -1896,8 +2002,8 @@ bool ts_parser_set_language(TSParser *self, const TSLanguage *language) { if (language) { if ( - language->version > TREE_SITTER_LANGUAGE_VERSION || - language->version < TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION + language->abi_version > TREE_SITTER_LANGUAGE_VERSION || + language->abi_version < TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION ) return false; if (ts_language_is_wasm(language)) { @@ -1985,6 +2091,10 @@ void ts_parser_reset(TSParser *self) { } self->accept_count = 0; self->has_scanner_error = false; + self->has_error = false; + self->canceled_balancing = false; + self->parse_options = (TSParseOptions) {0}; + self->parse_state = (TSParseState) {0}; } TSTree *ts_parser_parse( @@ -2004,8 +2114,16 @@ TSTree *ts_parser_parse( array_clear(&self->included_range_differences); self->included_range_difference_index = 0; + self->operation_count = 0; + if (self->timeout_duration) { + self->end_clock = clock_after(clock_now(), self->timeout_duration); + } else { + self->end_clock = 
clock_null(); + } + if (ts_parser_has_outstanding_parse(self)) { LOG("resume_parsing"); + if (self->canceled_balancing) goto balance; } else { ts_parser__external_scanner_create(self); if (self->has_scanner_error) goto exit; @@ -2022,7 +2140,7 @@ TSTree *ts_parser_parse( LOG("parse_after_edit"); LOG_TREE(self->old_tree); for (unsigned i = 0; i < self->included_range_differences.size; i++) { - TSRange *range = &self->included_range_differences.contents[i]; + TSRange *range = array_get(&self->included_range_differences, i); LOG("different_included_range %u - %u", range->start_byte, range->end_byte); } } else { @@ -2031,13 +2149,6 @@ TSTree *ts_parser_parse( } } - self->operation_count = 0; - if (self->timeout_duration) { - self->end_clock = clock_after(clock_now(), self->timeout_duration); - } else { - self->end_clock = clock_null(); - } - uint32_t position = 0, last_position = 0, version_count = 0; do { for ( @@ -2086,7 +2197,7 @@ TSTree *ts_parser_parse( } while (self->included_range_difference_index < self->included_range_differences.size) { - TSRange *range = &self->included_range_differences.contents[self->included_range_difference_index]; + TSRange *range = array_get(&self->included_range_differences, self->included_range_difference_index); if (range->end_byte <= position) { self->included_range_difference_index++; } else { @@ -2095,8 +2206,13 @@ TSTree *ts_parser_parse( } } while (version_count != 0); - assert(self->finished_tree.ptr); - ts_subtree_balance(self->finished_tree, &self->tree_pool, self->language); +balance: + ts_assert(self->finished_tree.ptr); + if (!ts_parser__balance_subtree(self)) { + self->canceled_balancing = true; + return false; + } + self->canceled_balancing = false; LOG("done"); LOG_TREE(self->finished_tree); @@ -2113,6 +2229,20 @@ TSTree *ts_parser_parse( return result; } +TSTree *ts_parser_parse_with_options( + TSParser *self, + const TSTree *old_tree, + TSInput input, + TSParseOptions parse_options +) { + self->parse_options = 
parse_options; + self->parse_state.payload = parse_options.payload; + TSTree *result = ts_parser_parse(self, old_tree, input); + // Reset parser options before further parse calls. + self->parse_options = (TSParseOptions) {0}; + return result; +} + TSTree *ts_parser_parse_string( TSParser *self, const TSTree *old_tree, @@ -2134,15 +2264,27 @@ TSTree *ts_parser_parse_string_encoding( &input, ts_string_input_read, encoding, + NULL, }); } void ts_parser_set_wasm_store(TSParser *self, TSWasmStore *store) { + if (self->language && ts_language_is_wasm(self->language)) { + // Copy the assigned language into the new store. + const TSLanguage *copy = ts_language_copy(self->language); + ts_parser_set_language(self, copy); + ts_language_delete(copy); + } + ts_wasm_store_delete(self->wasm_store); self->wasm_store = store; } TSWasmStore *ts_parser_take_wasm_store(TSParser *self) { + if (self->language && ts_language_is_wasm(self->language)) { + ts_parser_set_language(self, NULL); + } + TSWasmStore *result = self->wasm_store; self->wasm_store = NULL; return result; diff --git a/parser.h b/parser.h index 17f0e94b..858107de 100644 --- a/parser.h +++ b/parser.h @@ -18,6 +18,11 @@ typedef uint16_t TSStateId; typedef uint16_t TSSymbol; typedef uint16_t TSFieldId; typedef struct TSLanguage TSLanguage; +typedef struct TSLanguageMetadata { + uint8_t major_version; + uint8_t minor_version; + uint8_t patch_version; +} TSLanguageMetadata; #endif typedef struct { @@ -26,10 +31,11 @@ typedef struct { bool inherited; } TSFieldMapEntry; +// Used to index the field and supertype maps. 
typedef struct { uint16_t index; uint16_t length; -} TSFieldMapSlice; +} TSMapSlice; typedef struct { bool visible; @@ -47,6 +53,7 @@ struct TSLexer { uint32_t (*get_column)(TSLexer *); bool (*is_at_included_range_start)(const TSLexer *); bool (*eof)(const TSLexer *); + void (*log)(const TSLexer *, const char *, ...); }; typedef enum { @@ -78,6 +85,12 @@ typedef struct { uint16_t external_lex_state; } TSLexMode; +typedef struct { + uint16_t lex_state; + uint16_t external_lex_state; + uint16_t reserved_word_set_id; +} TSLexerMode; + typedef union { TSParseAction action; struct { @@ -92,7 +105,7 @@ typedef struct { } TSCharacterRange; struct TSLanguage { - uint32_t version; + uint32_t abi_version; uint32_t symbol_count; uint32_t alias_count; uint32_t token_count; @@ -108,13 +121,13 @@ struct TSLanguage { const TSParseActionEntry *parse_actions; const char * const *symbol_names; const char * const *field_names; - const TSFieldMapSlice *field_map_slices; + const TSMapSlice *field_map_slices; const TSFieldMapEntry *field_map_entries; const TSSymbolMetadata *symbol_metadata; const TSSymbol *public_symbol_map; const uint16_t *alias_map; const TSSymbol *alias_sequences; - const TSLexMode *lex_modes; + const TSLexerMode *lex_modes; bool (*lex_fn)(TSLexer *, TSStateId); bool (*keyword_lex_fn)(TSLexer *, TSStateId); TSSymbol keyword_capture_token; @@ -128,15 +141,23 @@ struct TSLanguage { void (*deserialize)(void *, const char *, unsigned); } external_scanner; const TSStateId *primary_state_ids; + const char *name; + const TSSymbol *reserved_words; + uint16_t max_reserved_word_set_size; + uint32_t supertype_count; + const TSSymbol *supertype_symbols; + const TSMapSlice *supertype_map_slices; + const TSSymbol *supertype_map_entries; + TSLanguageMetadata metadata; }; -static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t lookahead) { +static inline bool set_contains(const TSCharacterRange *ranges, uint32_t len, int32_t lookahead) { uint32_t index = 0; 
uint32_t size = len - index; while (size > 1) { uint32_t half_size = size / 2; uint32_t mid_index = index + half_size; - TSCharacterRange *range = &ranges[mid_index]; + const TSCharacterRange *range = &ranges[mid_index]; if (lookahead >= range->start && lookahead <= range->end) { return true; } else if (lookahead > range->end) { @@ -144,7 +165,7 @@ static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t } size -= half_size; } - TSCharacterRange *range = &ranges[index]; + const TSCharacterRange *range = &ranges[index]; return (lookahead >= range->start && lookahead <= range->end); } diff --git a/point.h b/point.h index 942a86e1..9745a0b6 100644 --- a/point.h +++ b/point.h @@ -22,7 +22,7 @@ static inline TSPoint point_sub(TSPoint a, TSPoint b) { if (a.row > b.row) return point__new(a.row - b.row, a.column); else - return point__new(0, a.column - b.column); + return point__new(0, (a.column >= b.column) ? a.column - b.column : 0); } static inline bool point_lte(TSPoint a, TSPoint b) { @@ -45,18 +45,4 @@ static inline bool point_eq(TSPoint a, TSPoint b) { return a.row == b.row && a.column == b.column; } -static inline TSPoint point_min(TSPoint a, TSPoint b) { - if (a.row < b.row || (a.row == b.row && a.column < b.column)) - return a; - else - return b; -} - -static inline TSPoint point_max(TSPoint a, TSPoint b) { - if (a.row > b.row || (a.row == b.row && a.column > b.column)) - return a; - else - return b; -} - #endif diff --git a/query.c b/query.c index 1b6e04b6..a0d5426e 100644 --- a/query.c +++ b/query.c @@ -1,6 +1,16 @@ +/* + * On NetBSD, defining standard requirements like this removes symbols + * from the namespace; however, we need non-standard symbols for + * endian.h. 
+ */ +#if defined(__NetBSD__) && defined(_POSIX_C_SOURCE) +#undef _POSIX_C_SOURCE +#endif + #include "api.h" #include "./alloc.h" #include "./array.h" +#include "./clock.h" #include "./language.h" #include "./point.h" #include "./tree_cursor.h" @@ -80,7 +90,7 @@ typedef struct { * for the entire top-level pattern. When iterating through a query's * captures using `ts_query_cursor_next_capture`, this field is used to * detect that a capture can safely be returned from a match that has not - * even completed yet. + * even completed yet. */ typedef struct { TSSymbol symbol; @@ -99,6 +109,7 @@ typedef struct { bool contains_captures: 1; bool root_pattern_guaranteed: 1; bool parent_pattern_guaranteed: 1; + bool is_missing: 1; } QueryStep; /* @@ -121,7 +132,7 @@ typedef struct { } SymbolTable; /** - * CaptureQuantififers - a data structure holding the quantifiers of pattern captures. + * CaptureQuantifiers - a data structure holding the quantifiers of pattern captures. */ typedef Array(uint8_t) CaptureQuantifiers; @@ -146,6 +157,7 @@ typedef struct { Slice steps; Slice predicate_steps; uint32_t start_byte; + uint32_t end_byte; bool is_non_local; } QueryPattern; @@ -171,7 +183,8 @@ typedef struct { * list of captures from the `CaptureListPool`. * - `seeking_immediate_match` - A flag that indicates that the state's next * step must be matched by the very next sibling. This is used when - * processing repetitions. + * processing repetitions, or when processing a wildcard node followed by + * an anchor. * - `has_in_progress_alternatives` - A flag that indicates that there is are * other states that have the same captures as this state, but are at * different steps in their pattern. 
This means that in order to obey the @@ -311,6 +324,11 @@ struct TSQueryCursor { TSPoint start_point; TSPoint end_point; uint32_t next_state_id; + TSClock end_clock; + TSDuration timeout_duration; + const TSQueryCursorOptions *query_options; + TSQueryCursorState query_state; + unsigned operation_count; bool on_visible_node; bool ascending; bool halted; @@ -321,6 +339,7 @@ static const TSQueryError PARENT_DONE = -1; static const uint16_t PATTERN_DONE_MARKER = UINT16_MAX; static const uint16_t NONE = UINT16_MAX; static const TSSymbol WILDCARD_SYMBOL = 0; +static const unsigned OP_COUNT_PER_QUERY_TIMEOUT_CHECK = 100; /********** * Stream @@ -418,26 +437,26 @@ static CaptureListPool capture_list_pool_new(void) { static void capture_list_pool_reset(CaptureListPool *self) { for (uint16_t i = 0; i < (uint16_t)self->list.size; i++) { // This invalid size means that the list is not in use. - self->list.contents[i].size = UINT32_MAX; + array_get(&self->list, i)->size = UINT32_MAX; } self->free_capture_list_count = self->list.size; } static void capture_list_pool_delete(CaptureListPool *self) { for (uint16_t i = 0; i < (uint16_t)self->list.size; i++) { - array_delete(&self->list.contents[i]); + array_delete(array_get(&self->list, i)); } array_delete(&self->list); } static const CaptureList *capture_list_pool_get(const CaptureListPool *self, uint16_t id) { if (id >= self->list.size) return &self->empty_list; - return &self->list.contents[id]; + return array_get(&self->list, id); } static CaptureList *capture_list_pool_get_mut(CaptureListPool *self, uint16_t id) { - assert(id < self->list.size); - return &self->list.contents[id]; + ts_assert(id < self->list.size); + return array_get(&self->list, id); } static bool capture_list_pool_is_empty(const CaptureListPool *self) { @@ -450,8 +469,8 @@ static uint16_t capture_list_pool_acquire(CaptureListPool *self) { // First see if any already allocated capture list is currently unused. 
if (self->free_capture_list_count > 0) { for (uint16_t i = 0; i < (uint16_t)self->list.size; i++) { - if (self->list.contents[i].size == UINT32_MAX) { - array_clear(&self->list.contents[i]); + if (array_get(&self->list, i)->size == UINT32_MAX) { + array_clear(array_get(&self->list, i)); self->free_capture_list_count--; return i; } @@ -472,7 +491,7 @@ static uint16_t capture_list_pool_acquire(CaptureListPool *self) { static void capture_list_pool_release(CaptureListPool *self, uint16_t id) { if (id >= self->list.size) return; - self->list.contents[id].size = UINT32_MAX; + array_get(&self->list, id)->size = UINT32_MAX; self->free_capture_list_count++; } @@ -755,10 +774,10 @@ static int symbol_table_id_for_name( uint32_t length ) { for (unsigned i = 0; i < self->slices.size; i++) { - Slice slice = self->slices.contents[i]; + Slice slice = *array_get(&self->slices, i); if ( slice.length == length && - !strncmp(&self->characters.contents[slice.offset], name, length) + !strncmp(array_get(&self->characters, slice.offset), name, length) ) return i; } return -1; @@ -769,9 +788,9 @@ static const char *symbol_table_name_for_id( uint16_t id, uint32_t *length ) { - Slice slice = self->slices.contents[id]; + Slice slice = *(array_get(&self->slices,id)); *length = slice.length; - return &self->characters.contents[slice.offset]; + return array_get(&self->characters, slice.offset); } static uint16_t symbol_table_insert_name( @@ -786,8 +805,8 @@ static uint16_t symbol_table_insert_name( .length = length, }; array_grow_by(&self->characters, length + 1); - memcpy(&self->characters.contents[slice.offset], name, length); - self->characters.contents[self->characters.size - 1] = 0; + memcpy(array_get(&self->characters, slice.offset), name, length); + *array_get(&self->characters, self->characters.size - 1) = 0; array_push(&self->slices, slice); return self->slices.size - 1; } @@ -909,38 +928,29 @@ static unsigned analysis_state__recursion_depth(const AnalysisState *self) { return result; 
} -static inline int analysis_state__compare_position( +static inline int analysis_state__compare( AnalysisState *const *self, AnalysisState *const *other ) { + if ((*self)->depth < (*other)->depth) return 1; for (unsigned i = 0; i < (*self)->depth; i++) { if (i >= (*other)->depth) return -1; - if ((*self)->stack[i].child_index < (*other)->stack[i].child_index) return -1; - if ((*self)->stack[i].child_index > (*other)->stack[i].child_index) return 1; + AnalysisStateEntry s1 = (*self)->stack[i]; + AnalysisStateEntry s2 = (*other)->stack[i]; + if (s1.child_index < s2.child_index) return -1; + if (s1.child_index > s2.child_index) return 1; + if (s1.parent_symbol < s2.parent_symbol) return -1; + if (s1.parent_symbol > s2.parent_symbol) return 1; + if (s1.parse_state < s2.parse_state) return -1; + if (s1.parse_state > s2.parse_state) return 1; + if (s1.field_id < s2.field_id) return -1; + if (s1.field_id > s2.field_id) return 1; } - if ((*self)->depth < (*other)->depth) return 1; if ((*self)->step_index < (*other)->step_index) return -1; if ((*self)->step_index > (*other)->step_index) return 1; return 0; } -static inline int analysis_state__compare( - AnalysisState *const *self, - AnalysisState *const *other -) { - int result = analysis_state__compare_position(self, other); - if (result != 0) return result; - for (unsigned i = 0; i < (*self)->depth; i++) { - if ((*self)->stack[i].parent_symbol < (*other)->stack[i].parent_symbol) return -1; - if ((*self)->stack[i].parent_symbol > (*other)->stack[i].parent_symbol) return 1; - if ((*self)->stack[i].parse_state < (*other)->stack[i].parse_state) return -1; - if ((*self)->stack[i].parse_state > (*other)->stack[i].parse_state) return 1; - if ((*self)->stack[i].field_id < (*other)->stack[i].field_id) return -1; - if ((*self)->stack[i].field_id > (*other)->stack[i].field_id) return 1; - } - return 0; -} - static inline AnalysisStateEntry *analysis_state__top(AnalysisState *self) { if (self->depth == 0) { return &self->stack[0]; 
@@ -1099,23 +1109,23 @@ static inline bool ts_query__pattern_map_search( while (size > 1) { uint32_t half_size = size / 2; uint32_t mid_index = base_index + half_size; - TSSymbol mid_symbol = self->steps.contents[ - self->pattern_map.contents[mid_index].step_index - ].symbol; + TSSymbol mid_symbol = array_get(&self->steps, + array_get(&self->pattern_map, mid_index)->step_index + )->symbol; if (needle > mid_symbol) base_index = mid_index; size -= half_size; } - TSSymbol symbol = self->steps.contents[ - self->pattern_map.contents[base_index].step_index - ].symbol; + TSSymbol symbol = array_get(&self->steps, + array_get(&self->pattern_map, base_index)->step_index + )->symbol; if (needle > symbol) { base_index++; if (base_index < self->pattern_map.size) { - symbol = self->steps.contents[ - self->pattern_map.contents[base_index].step_index - ].symbol; + symbol = array_get(&self->steps, + array_get(&self->pattern_map, base_index)->step_index + )->symbol; } } @@ -1138,9 +1148,9 @@ static inline void ts_query__pattern_map_insert( // initiated first, which allows the ordering of the states array // to be maintained more efficiently. while (index < self->pattern_map.size) { - PatternEntry *entry = &self->pattern_map.contents[index]; + PatternEntry *entry = array_get(&self->pattern_map, index); if ( - self->steps.contents[entry->step_index].symbol == symbol && + array_get(&self->steps, entry->step_index)->symbol == symbol && entry->pattern_index < new_entry.pattern_index ) { index++; @@ -1173,11 +1183,11 @@ static void ts_query__perform_analysis( #ifdef DEBUG_ANALYZE_QUERY printf("Iteration: %u. 
Final step indices:", iteration); for (unsigned j = 0; j < analysis->final_step_indices.size; j++) { - printf(" %4u", analysis->final_step_indices.contents[j]); + printf(" %4u", *array_get(&analysis->final_step_indices, j)); } printf("\n"); for (unsigned j = 0; j < analysis->states.size; j++) { - AnalysisState *state = analysis->states.contents[j]; + AnalysisState *state = *array_get(&analysis->states, j); printf(" %3u: step: %u, stack: [", j, state->step_index); for (unsigned k = 0; k < state->depth; k++) { printf( @@ -1220,7 +1230,7 @@ static void ts_query__perform_analysis( analysis_state_set__clear(&analysis->next_states, &analysis->state_pool); for (unsigned j = 0; j < analysis->states.size; j++) { - AnalysisState * const state = analysis->states.contents[j]; + AnalysisState * const state = *array_get(&analysis->states, j); // For efficiency, it's important to avoid processing the same analysis state more // than once. To achieve this, keep the states in order of ascending position within @@ -1228,7 +1238,7 @@ static void ts_query__perform_analysis( // the states that have made the least progress. Avoid advancing states that have already // made more progress. 
if (analysis->next_states.size > 0) { - int comparison = analysis_state__compare_position( + int comparison = analysis_state__compare( &state, array_back(&analysis->next_states) ); @@ -1243,7 +1253,7 @@ static void ts_query__perform_analysis( analysis_state_set__push( &analysis->next_states, &analysis->state_pool, - analysis->states.contents[j] + *array_get(&analysis->states, j) ); j++; } @@ -1255,12 +1265,12 @@ static void ts_query__perform_analysis( const TSSymbol parent_symbol = analysis_state__top(state)->parent_symbol; const TSFieldId parent_field_id = analysis_state__top(state)->field_id; const unsigned child_index = analysis_state__top(state)->child_index; - const QueryStep * const step = &self->steps.contents[state->step_index]; + const QueryStep * const step = array_get(&self->steps, state->step_index); unsigned subgraph_index, exists; array_search_sorted_by(subgraphs, .symbol, parent_symbol, &subgraph_index, &exists); if (!exists) continue; - const AnalysisSubgraph *subgraph = &subgraphs->contents[subgraph_index]; + const AnalysisSubgraph *subgraph = array_get(subgraphs, subgraph_index); // Follow every possible path in the parse table, but only visit states that // are part of the subgraph for the current symbol. @@ -1296,7 +1306,8 @@ static void ts_query__perform_analysis( &node_index, &exists ); while (node_index < subgraph->nodes.size) { - AnalysisSubgraphNode *node = &subgraph->nodes.contents[node_index++]; + AnalysisSubgraphNode *node = array_get(&subgraph->nodes, node_index); + node_index++; if (node->state != successor.state || node->child_index != successor.child_index) break; // Use the subgraph to determine what alias and field will eventually be applied @@ -1329,7 +1340,12 @@ static void ts_query__perform_analysis( // Determine if this hypothetical child node would match the current step // of the query pattern. 
bool does_match = false; - if (visible_symbol) { + + // ERROR nodes can appear anywhere, so if the step is + // looking for an ERROR node, consider it potentially matchable. + if (step->symbol == ts_builtin_sym_error) { + does_match = true; + } else if (visible_symbol) { does_match = true; if (step->symbol == WILDCARD_SYMBOL) { if ( @@ -1397,7 +1413,7 @@ static void ts_query__perform_analysis( if (does_match) { for (;;) { next_state.step_index++; - next_step = &self->steps.contents[next_state.step_index]; + next_step = array_get(&self->steps, next_state.step_index); if ( next_step->depth == PATTERN_DONE_MARKER || next_step->depth <= step->depth @@ -1421,7 +1437,7 @@ static void ts_query__perform_analysis( // record that matching can terminate at this step of the pattern. Otherwise, // add this state to the list of states to process on the next iteration. if (!next_step->is_dead_end) { - bool did_finish_pattern = self->steps.contents[next_state.step_index].depth != step->depth; + bool did_finish_pattern = array_get(&self->steps, next_state.step_index)->depth != step->depth; if (did_finish_pattern) { array_insert_sorted_by(&analysis->finished_parent_symbols, , state->root_symbol); } else if (next_state.depth == 0) { @@ -1441,7 +1457,7 @@ static void ts_query__perform_analysis( next_step->alternative_index > next_state.step_index ) { next_state.step_index = next_step->alternative_index; - next_step = &self->steps.contents[next_state.step_index]; + next_step = array_get(&self->steps, next_state.step_index); } else { break; } @@ -1459,9 +1475,9 @@ static void ts_query__perform_analysis( static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { Array(uint16_t) non_rooted_pattern_start_steps = array_new(); for (unsigned i = 0; i < self->pattern_map.size; i++) { - PatternEntry *pattern = &self->pattern_map.contents[i]; + PatternEntry *pattern = array_get(&self->pattern_map, i); if (!pattern->is_rooted) { - QueryStep *step = 
&self->steps.contents[pattern->step_index]; + QueryStep *step = array_get(&self->steps, pattern->step_index); if (step->symbol != WILDCARD_SYMBOL) { array_push(&non_rooted_pattern_start_steps, i); } @@ -1473,7 +1489,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // captures, and record the indices of all of the steps that have child steps. Array(uint32_t) parent_step_indices = array_new(); for (unsigned i = 0; i < self->steps.size; i++) { - QueryStep *step = &self->steps.contents[i]; + QueryStep *step = array_get(&self->steps, i); if (step->depth == PATTERN_DONE_MARKER) { step->parent_pattern_guaranteed = true; step->root_pattern_guaranteed = true; @@ -1484,7 +1500,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { bool is_wildcard = step->symbol == WILDCARD_SYMBOL; step->contains_captures = step->capture_ids[0] != NONE; for (unsigned j = i + 1; j < self->steps.size; j++) { - QueryStep *next_step = &self->steps.contents[j]; + QueryStep *next_step = array_get(&self->steps, j); if ( next_step->depth == PATTERN_DONE_MARKER || next_step->depth <= step->depth @@ -1514,8 +1530,8 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // parent. 
AnalysisSubgraphArray subgraphs = array_new(); for (unsigned i = 0; i < parent_step_indices.size; i++) { - uint32_t parent_step_index = parent_step_indices.contents[i]; - TSSymbol parent_symbol = self->steps.contents[parent_step_index].symbol; + uint32_t parent_step_index = *array_get(&parent_step_indices, i); + TSSymbol parent_symbol = array_get(&self->steps, parent_step_index)->symbol; AnalysisSubgraph subgraph = { .symbol = parent_symbol }; array_insert_sorted_by(&subgraphs, .symbol, subgraph); } @@ -1557,7 +1573,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { &exists ); if (exists) { - AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index]; + AnalysisSubgraph *subgraph = array_get(&subgraphs, subgraph_index); if (subgraph->nodes.size == 0 || array_back(&subgraph->nodes)->state != state) { array_push(&subgraph->nodes, ((AnalysisSubgraphNode) { .state = state, @@ -1594,7 +1610,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { &exists ); if (exists) { - AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index]; + AnalysisSubgraph *subgraph = array_get(&subgraphs, subgraph_index); if ( subgraph->start_states.size == 0 || *array_back(&subgraph->start_states) != state @@ -1611,7 +1627,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // from the end states using the predecessor map. 
Array(AnalysisSubgraphNode) next_nodes = array_new(); for (unsigned i = 0; i < subgraphs.size; i++) { - AnalysisSubgraph *subgraph = &subgraphs.contents[i]; + AnalysisSubgraph *subgraph = array_get(&subgraphs, i); if (subgraph->nodes.size == 0) { array_delete(&subgraph->start_states); array_erase(&subgraphs, i); @@ -1652,16 +1668,16 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { #ifdef DEBUG_ANALYZE_QUERY printf("\nSubgraphs:\n"); for (unsigned i = 0; i < subgraphs.size; i++) { - AnalysisSubgraph *subgraph = &subgraphs.contents[i]; + AnalysisSubgraph *subgraph = array_get(&subgraphs, i); printf(" %u, %s:\n", subgraph->symbol, ts_language_symbol_name(self->language, subgraph->symbol)); for (unsigned j = 0; j < subgraph->start_states.size; j++) { printf( " {state: %u}\n", - subgraph->start_states.contents[j] + *array_get(&subgraph->start_states, j) ); } for (unsigned j = 0; j < subgraph->nodes.size; j++) { - AnalysisSubgraphNode *node = &subgraph->nodes.contents[j]; + AnalysisSubgraphNode *node = array_get(&subgraph->nodes, j); printf( " {state: %u, child_index: %u, production_id: %u, done: %d}\n", node->state, node->child_index, node->production_id, node->done @@ -1676,9 +1692,9 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { bool all_patterns_are_valid = true; QueryAnalysis analysis = query_analysis__new(); for (unsigned i = 0; i < parent_step_indices.size; i++) { - uint16_t parent_step_index = parent_step_indices.contents[i]; - uint16_t parent_depth = self->steps.contents[parent_step_index].depth; - TSSymbol parent_symbol = self->steps.contents[parent_step_index].symbol; + uint16_t parent_step_index = *array_get(&parent_step_indices, i); + uint16_t parent_depth = array_get(&self->steps, parent_step_index)->depth; + TSSymbol parent_symbol = array_get(&self->steps, parent_step_index)->symbol; if (parent_symbol == ts_builtin_sym_error) continue; // Find the subgraph that corresponds to this 
pattern's root symbol. If the pattern's @@ -1689,19 +1705,19 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { unsigned first_child_step_index = parent_step_index + 1; uint32_t j, child_exists; array_search_sorted_by(&self->step_offsets, .step_index, first_child_step_index, &j, &child_exists); - assert(child_exists); - *error_offset = self->step_offsets.contents[j].byte_offset; + ts_assert(child_exists); + *error_offset = array_get(&self->step_offsets, j)->byte_offset; all_patterns_are_valid = false; break; } // Initialize an analysis state at every parse state in the table where // this parent symbol can occur. - AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index]; + AnalysisSubgraph *subgraph = array_get(&subgraphs, subgraph_index); analysis_state_set__clear(&analysis.states, &analysis.state_pool); analysis_state_set__clear(&analysis.deeper_states, &analysis.state_pool); for (unsigned j = 0; j < subgraph->start_states.size; j++) { - TSStateId parse_state = subgraph->start_states.contents[j]; + TSStateId parse_state = *array_get(&subgraph->start_states, j); analysis_state_set__push(&analysis.states, &analysis.state_pool, &((AnalysisState) { .step_index = parent_step_index + 1, .stack = { @@ -1721,7 +1737,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { #ifdef DEBUG_ANALYZE_QUERY printf( "\nWalk states for %s:\n", - ts_language_symbol_name(self->language, analysis.states.contents[0]->stack[0].parent_symbol) + ts_language_symbol_name(self->language, (*array_get(&analysis.states, 0))->stack[0].parent_symbol) ); #endif @@ -1732,7 +1748,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // be considered fallible. 
if (analysis.did_abort) { for (unsigned j = parent_step_index + 1; j < self->steps.size; j++) { - QueryStep *step = &self->steps.contents[j]; + QueryStep *step = array_get(&self->steps, j); if ( step->depth <= parent_depth || step->depth == PATTERN_DONE_MARKER @@ -1748,12 +1764,12 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // If this pattern cannot match, store the pattern index so that it can be // returned to the caller. if (analysis.finished_parent_symbols.size == 0) { - assert(analysis.final_step_indices.size > 0); + ts_assert(analysis.final_step_indices.size > 0); uint16_t impossible_step_index = *array_back(&analysis.final_step_indices); uint32_t j, impossible_exists; array_search_sorted_by(&self->step_offsets, .step_index, impossible_step_index, &j, &impossible_exists); if (j >= self->step_offsets.size) j = self->step_offsets.size - 1; - *error_offset = self->step_offsets.contents[j].byte_offset; + *error_offset = array_get(&self->step_offsets, j)->byte_offset; all_patterns_are_valid = false; break; } @@ -1761,8 +1777,8 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // Mark as fallible any step where a match terminated. // Later, this property will be propagated to all of the step's predecessors. for (unsigned j = 0; j < analysis.final_step_indices.size; j++) { - uint32_t final_step_index = analysis.final_step_indices.contents[j]; - QueryStep *step = &self->steps.contents[final_step_index]; + uint32_t final_step_index = *array_get(&analysis.final_step_indices, j); + QueryStep *step = array_get(&self->steps, final_step_index); if ( step->depth != PATTERN_DONE_MARKER && step->depth > parent_depth && @@ -1777,7 +1793,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // Mark as indefinite any step with captures that are used in predicates. 
Array(uint16_t) predicate_capture_ids = array_new(); for (unsigned i = 0; i < self->patterns.size; i++) { - QueryPattern *pattern = &self->patterns.contents[i]; + QueryPattern *pattern = array_get(&self->patterns, i); // Gather all of the captures that are used in predicates for this pattern. array_clear(&predicate_capture_ids); @@ -1786,7 +1802,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { end = start + pattern->predicate_steps.length, j = start; j < end; j++ ) { - TSQueryPredicateStep *step = &self->predicate_steps.contents[j]; + TSQueryPredicateStep *step = array_get(&self->predicate_steps, j); if (step->type == TSQueryPredicateStepTypeCapture) { uint16_t value_id = step->value_id; array_insert_sorted_by(&predicate_capture_ids, , value_id); @@ -1799,7 +1815,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { end = start + pattern->steps.length, j = start; j < end; j++ ) { - QueryStep *step = &self->steps.contents[j]; + QueryStep *step = array_get(&self->steps, j); for (unsigned k = 0; k < MAX_STEP_CAPTURE_COUNT; k++) { uint16_t capture_id = step->capture_ids[k]; if (capture_id == NONE) break; @@ -1819,7 +1835,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { while (!done) { done = true; for (unsigned i = self->steps.size - 1; i > 0; i--) { - QueryStep *step = &self->steps.contents[i]; + QueryStep *step = array_get(&self->steps, i); if (step->depth == PATTERN_DONE_MARKER) continue; // Determine if this step is definite or has definite alternatives. @@ -1832,12 +1848,12 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { if (step->alternative_index == NONE || step->alternative_index < i) { break; } - step = &self->steps.contents[step->alternative_index]; + step = array_get(&self->steps, step->alternative_index); } // If not, mark its predecessor as indefinite. 
if (!parent_pattern_guaranteed) { - QueryStep *prev_step = &self->steps.contents[i - 1]; + QueryStep *prev_step = array_get(&self->steps, i - 1); if ( !prev_step->is_dead_end && prev_step->depth != PATTERN_DONE_MARKER && @@ -1853,7 +1869,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { #ifdef DEBUG_ANALYZE_QUERY printf("Steps:\n"); for (unsigned i = 0; i < self->steps.size; i++) { - QueryStep *step = &self->steps.contents[i]; + QueryStep *step = array_get(&self->steps, i); if (step->depth == PATTERN_DONE_MARKER) { printf(" %u: DONE\n", i); } else { @@ -1877,18 +1893,18 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // prevent certain optimizations with range restrictions. analysis.did_abort = false; for (uint32_t i = 0; i < non_rooted_pattern_start_steps.size; i++) { - uint16_t pattern_entry_index = non_rooted_pattern_start_steps.contents[i]; - PatternEntry *pattern_entry = &self->pattern_map.contents[pattern_entry_index]; + uint16_t pattern_entry_index = *array_get(&non_rooted_pattern_start_steps, i); + PatternEntry *pattern_entry = array_get(&self->pattern_map, pattern_entry_index); analysis_state_set__clear(&analysis.states, &analysis.state_pool); analysis_state_set__clear(&analysis.deeper_states, &analysis.state_pool); for (unsigned j = 0; j < subgraphs.size; j++) { - AnalysisSubgraph *subgraph = &subgraphs.contents[j]; + AnalysisSubgraph *subgraph = array_get(&subgraphs, j); TSSymbolMetadata metadata = ts_language_symbol_metadata(self->language, subgraph->symbol); if (metadata.visible || metadata.named) continue; for (uint32_t k = 0; k < subgraph->start_states.size; k++) { - TSStateId parse_state = subgraph->start_states.contents[k]; + TSStateId parse_state = *array_get(&subgraph->start_states, k); analysis_state_set__push(&analysis.states, &analysis.state_pool, &((AnalysisState) { .step_index = pattern_entry->step_index, .stack = { @@ -1917,11 +1933,11 @@ static bool 
ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { ); if (analysis.finished_parent_symbols.size > 0) { - self->patterns.contents[pattern_entry->pattern_index].is_non_local = true; + array_get(&self->patterns, pattern_entry->pattern_index)->is_non_local = true; } for (unsigned k = 0; k < analysis.finished_parent_symbols.size; k++) { - TSSymbol symbol = analysis.finished_parent_symbols.contents[k]; + TSSymbol symbol = *array_get(&analysis.finished_parent_symbols, k); array_insert_sorted_by(&self->repeat_symbols_with_rootless_patterns, , symbol); } } @@ -1931,7 +1947,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { printf("\nRepetition symbols with rootless patterns:\n"); printf("aborted analysis: %d\n", analysis.did_abort); for (unsigned i = 0; i < self->repeat_symbols_with_rootless_patterns.size; i++) { - TSSymbol symbol = self->repeat_symbols_with_rootless_patterns.contents[i]; + TSSymbol symbol = *array_get(&self->repeat_symbols_with_rootless_patterns, i); printf(" %u, %s\n", symbol, ts_language_symbol_name(self->language, symbol)); } printf("\n"); @@ -1940,8 +1956,8 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // Cleanup for (unsigned i = 0; i < subgraphs.size; i++) { - array_delete(&subgraphs.contents[i].start_states); - array_delete(&subgraphs.contents[i].nodes); + array_delete(&array_get(&subgraphs, i)->start_states); + array_delete(&array_get(&subgraphs, i)->nodes); } array_delete(&subgraphs); query_analysis__delete(&analysis); @@ -1960,7 +1976,7 @@ static void ts_query__add_negated_fields( TSFieldId *field_ids, uint16_t field_count ) { - QueryStep *step = &self->steps.contents[step_index]; + QueryStep *step = array_get(&self->steps, step_index); // The negated field array stores a list of field lists, separated by zeros. // Try to find the start index of an existing list that matches this new list. 
@@ -1968,7 +1984,7 @@ static void ts_query__add_negated_fields( unsigned match_count = 0; unsigned start_i = 0; for (unsigned i = 0; i < self->negated_fields.size; i++) { - TSFieldId existing_field_id = self->negated_fields.contents[i]; + TSFieldId existing_field_id = *array_get(&self->negated_fields, i); // At each zero value, terminate the match attempt. If we've exactly // matched the new field list, then reuse this index. Otherwise, @@ -2238,10 +2254,10 @@ static TSQueryError ts_query__parse_pattern( // For all of the branches except for the last one, add the subsequent branch as an // alternative, and link the end of the branch to the current end of the steps. for (unsigned i = 0; i < branch_step_indices.size - 1; i++) { - uint32_t step_index = branch_step_indices.contents[i]; - uint32_t next_step_index = branch_step_indices.contents[i + 1]; - QueryStep *start_step = &self->steps.contents[step_index]; - QueryStep *end_step = &self->steps.contents[next_step_index - 1]; + uint32_t step_index = *array_get(&branch_step_indices, i); + uint32_t next_step_index = *array_get(&branch_step_indices, i + 1); + QueryStep *start_step = array_get(&self->steps, step_index); + QueryStep *end_step = array_get(&self->steps, next_step_index - 1); start_step->alternative_index = next_step_index; end_step->alternative_index = self->steps.size; end_step->is_dead_end = true; @@ -2305,16 +2321,62 @@ static TSQueryError ts_query__parse_pattern( // Otherwise, this parenthesis is the start of a named node. 
else { TSSymbol symbol; + bool is_missing = false; + const char *node_name = stream->input; // Parse a normal node name if (stream_is_ident_start(stream)) { - const char *node_name = stream->input; stream_scan_identifier(stream); uint32_t length = (uint32_t)(stream->input - node_name); // Parse the wildcard symbol if (length == 1 && node_name[0] == '_') { symbol = WILDCARD_SYMBOL; + } else if (!strncmp(node_name, "MISSING", length)) { + is_missing = true; + stream_skip_whitespace(stream); + + if (stream_is_ident_start(stream)) { + const char *missing_node_name = stream->input; + stream_scan_identifier(stream); + uint32_t missing_node_length = (uint32_t)(stream->input - missing_node_name); + symbol = ts_language_symbol_for_name( + self->language, + missing_node_name, + missing_node_length, + true + ); + if (!symbol) { + stream_reset(stream, missing_node_name); + return TSQueryErrorNodeType; + } + } + + else if (stream->next == '"') { + const char *string_start = stream->input; + TSQueryError e = ts_query__parse_string_literal(self, stream); + if (e) return e; + + symbol = ts_language_symbol_for_name( + self->language, + self->string_buffer.contents, + self->string_buffer.size, + false + ); + if (!symbol) { + stream_reset(stream, string_start + 1); + return TSQueryErrorNodeType; + } + } + + else if (stream->next == ')') { + symbol = WILDCARD_SYMBOL; + } + + else { + stream_reset(stream, stream->input); + return TSQueryErrorSyntax; + } } else { @@ -2340,6 +2402,9 @@ static TSQueryError ts_query__parse_pattern( step->supertype_symbol = step->symbol; step->symbol = WILDCARD_SYMBOL; } + if (is_missing) { + step->is_missing = true; + } if (symbol == WILDCARD_SYMBOL) { step->is_named = true; } @@ -2347,26 +2412,56 @@ static TSQueryError ts_query__parse_pattern( stream_skip_whitespace(stream); if (stream->next == '/') { + if (!step->supertype_symbol) { + stream_reset(stream, node_name - 1); // reset to the start of the node + return TSQueryErrorStructure; + } + 
stream_advance(stream); if (!stream_is_ident_start(stream)) { return TSQueryErrorSyntax; } - const char *node_name = stream->input; + const char *subtype_node_name = stream->input; stream_scan_identifier(stream); - uint32_t length = (uint32_t)(stream->input - node_name); + uint32_t length = (uint32_t)(stream->input - subtype_node_name); step->symbol = ts_language_symbol_for_name( self->language, - node_name, + subtype_node_name, length, true ); if (!step->symbol) { - stream_reset(stream, node_name); + stream_reset(stream, subtype_node_name); return TSQueryErrorNodeType; } + // Get all the possible subtypes for the given supertype, + // and check if the given subtype is valid. + if (self->language->abi_version >= LANGUAGE_VERSION_WITH_RESERVED_WORDS) { + uint32_t subtype_length; + const TSSymbol *subtypes = ts_language_subtypes( + self->language, + step->supertype_symbol, + &subtype_length + ); + + bool subtype_is_valid = false; + for (uint32_t i = 0; i < subtype_length; i++) { + if (subtypes[i] == step->symbol) { + subtype_is_valid = true; + break; + } + } + + // This subtype is not valid for the given supertype. + if (!subtype_is_valid) { + stream_reset(stream, node_name - 1); // reset to the start of the node + return TSQueryErrorStructure; + } + } + stream_skip_whitespace(stream); } @@ -2425,6 +2520,9 @@ static TSQueryError ts_query__parse_pattern( child_is_immediate, &child_capture_quantifiers ); + // In the event we only parsed a predicate, meaning no new steps were added, + // then subtract one so we're not indexing past the end of the array + if (step_index == self->steps.size) step_index--; if (e == PARENT_DONE) { if (stream->next == ')') { if (child_is_immediate) { @@ -2432,7 +2530,23 @@ static TSQueryError ts_query__parse_pattern( capture_quantifiers_delete(&child_capture_quantifiers); return TSQueryErrorSyntax; } - self->steps.contents[last_child_step_index].is_last_child = true; + // Mark this step *and* its alternatives as the last child of the parent. 
+ QueryStep *last_child_step = array_get(&self->steps, last_child_step_index); + last_child_step->is_last_child = true; + if ( + last_child_step->alternative_index != NONE && + last_child_step->alternative_index < self->steps.size + ) { + QueryStep *alternative_step = array_get(&self->steps, last_child_step->alternative_index); + alternative_step->is_last_child = true; + while ( + alternative_step->alternative_index != NONE && + alternative_step->alternative_index < self->steps.size + ) { + alternative_step = array_get(&self->steps, alternative_step->alternative_index); + alternative_step->is_last_child = true; + } + } } if (negated_field_count) { @@ -2535,7 +2649,7 @@ static TSQueryError ts_query__parse_pattern( } uint32_t step_index = starting_step_index; - QueryStep *step = &self->steps.contents[step_index]; + QueryStep *step = array_get(&self->steps, step_index); for (;;) { step->field = field_id; if ( @@ -2544,7 +2658,7 @@ static TSQueryError ts_query__parse_pattern( step->alternative_index < self->steps.size ) { step_index = step->alternative_index; - step = &self->steps.contents[step_index]; + step = array_get(&self->steps, step_index); } else { break; } @@ -2593,9 +2707,9 @@ static TSQueryError ts_query__parse_pattern( // Stop when `step->alternative_index` is `NONE` or it points to // `repeat_step` or beyond. Note that having just been pushed, // `repeat_step` occupies slot `self->steps.size - 1`. 
- QueryStep *step = &self->steps.contents[starting_step_index]; + QueryStep *step = array_get(&self->steps, starting_step_index); while (step->alternative_index != NONE && step->alternative_index < self->steps.size - 1) { - step = &self->steps.contents[step->alternative_index]; + step = array_get(&self->steps, step->alternative_index); } step->alternative_index = self->steps.size; } @@ -2607,9 +2721,9 @@ static TSQueryError ts_query__parse_pattern( stream_advance(stream); stream_skip_whitespace(stream); - QueryStep *step = &self->steps.contents[starting_step_index]; + QueryStep *step = array_get(&self->steps, starting_step_index); while (step->alternative_index != NONE && step->alternative_index < self->steps.size) { - step = &self->steps.contents[step->alternative_index]; + step = array_get(&self->steps, step->alternative_index); } step->alternative_index = self->steps.size; } @@ -2635,7 +2749,7 @@ static TSQueryError ts_query__parse_pattern( uint32_t step_index = starting_step_index; for (;;) { - QueryStep *step = &self->steps.contents[step_index]; + QueryStep *step = array_get(&self->steps, step_index); query_step__add_capture(step, capture_id); if ( step->alternative_index != NONE && @@ -2669,8 +2783,8 @@ TSQuery *ts_query_new( ) { if ( !language || - language->version > TREE_SITTER_LANGUAGE_VERSION || - language->version < TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION + language->abi_version > TREE_SITTER_LANGUAGE_VERSION || + language->abi_version < TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION ) { *error_type = TSQueryErrorLanguage; return NULL; @@ -2715,6 +2829,7 @@ TSQuery *ts_query_new( QueryPattern *pattern = array_back(&self->patterns); pattern->steps.length = self->steps.size - start_step_index; pattern->predicate_steps.length = self->predicate_steps.size - start_predicate_step_index; + pattern->end_byte = stream_offset(&stream); // If any pattern could not be parsed, then report the error information // and terminate. 
@@ -2732,15 +2847,15 @@ TSQuery *ts_query_new( // Maintain a map that can look up patterns for a given root symbol. uint16_t wildcard_root_alternative_index = NONE; for (;;) { - QueryStep *step = &self->steps.contents[start_step_index]; + QueryStep *step = array_get(&self->steps, start_step_index); // If a pattern has a wildcard at its root, but it has a non-wildcard child, // then optimize the matching process by skipping matching the wildcard. // Later, during the matching process, the query cursor will check that // there is a parent node, and capture it if necessary. if (step->symbol == WILDCARD_SYMBOL && step->depth == 0 && !step->field) { - QueryStep *second_step = &self->steps.contents[start_step_index + 1]; - if (second_step->symbol != WILDCARD_SYMBOL && second_step->depth == 1) { + QueryStep *second_step = array_get(&self->steps, start_step_index + 1); + if (second_step->symbol != WILDCARD_SYMBOL && second_step->depth == 1 && !second_step->is_immediate) { wildcard_root_alternative_index = step->alternative_index; start_step_index += 1; step = second_step; @@ -2754,7 +2869,7 @@ TSQuery *ts_query_new( uint32_t start_depth = step->depth; bool is_rooted = start_depth == 0; for (uint32_t step_index = start_step_index + 1; step_index < self->steps.size; step_index++) { - QueryStep *child_step = &self->steps.contents[step_index]; + QueryStep *child_step = array_get(&self->steps, step_index); if (child_step->is_dead_end) break; if (child_step->depth == start_depth) { is_rooted = false; @@ -2858,19 +2973,24 @@ const TSQueryPredicateStep *ts_query_predicates_for_pattern( uint32_t pattern_index, uint32_t *step_count ) { - Slice slice = self->patterns.contents[pattern_index].predicate_steps; + Slice slice = array_get(&self->patterns, pattern_index)->predicate_steps; *step_count = slice.length; - if (self->predicate_steps.contents == NULL) { - return NULL; - } - return &self->predicate_steps.contents[slice.offset]; + if (slice.length == 0) return NULL; + return 
array_get(&self->predicate_steps, slice.offset); } uint32_t ts_query_start_byte_for_pattern( const TSQuery *self, uint32_t pattern_index ) { - return self->patterns.contents[pattern_index].start_byte; + return array_get(&self->patterns, pattern_index)->start_byte; +} + +uint32_t ts_query_end_byte_for_pattern( + const TSQuery *self, + uint32_t pattern_index +) { + return array_get(&self->patterns, pattern_index)->end_byte; } bool ts_query_is_pattern_rooted( @@ -2878,7 +2998,7 @@ bool ts_query_is_pattern_rooted( uint32_t pattern_index ) { for (unsigned i = 0; i < self->pattern_map.size; i++) { - PatternEntry *entry = &self->pattern_map.contents[i]; + PatternEntry *entry = array_get(&self->pattern_map, i); if (entry->pattern_index == pattern_index) { if (!entry->is_rooted) return false; } @@ -2891,7 +3011,7 @@ bool ts_query_is_pattern_non_local( uint32_t pattern_index ) { if (pattern_index < self->patterns.size) { - return self->patterns.contents[pattern_index].is_non_local; + return array_get(&self->patterns, pattern_index)->is_non_local; } else { return false; } @@ -2903,12 +3023,12 @@ bool ts_query_is_pattern_guaranteed_at_step( ) { uint32_t step_index = UINT32_MAX; for (unsigned i = 0; i < self->step_offsets.size; i++) { - StepOffset *step_offset = &self->step_offsets.contents[i]; + StepOffset *step_offset = array_get(&self->step_offsets, i); if (step_offset->byte_offset > byte_offset) break; step_index = step_offset->step_index; } if (step_index < self->steps.size) { - return self->steps.contents[step_index].root_pattern_guaranteed; + return array_get(&self->steps, step_index)->root_pattern_guaranteed; } else { return false; } @@ -2918,13 +3038,13 @@ bool ts_query__step_is_fallible( const TSQuery *self, uint16_t step_index ) { - assert((uint32_t)step_index + 1 < self->steps.size); - QueryStep *step = &self->steps.contents[step_index]; - QueryStep *next_step = &self->steps.contents[step_index + 1]; + ts_assert((uint32_t)step_index + 1 < self->steps.size); + 
QueryStep *step = array_get(&self->steps, step_index); + QueryStep *next_step = array_get(&self->steps, step_index + 1); return ( next_step->depth != PATTERN_DONE_MARKER && next_step->depth > step->depth && - !next_step->parent_pattern_guaranteed + (!next_step->parent_pattern_guaranteed || step->symbol == WILDCARD_SYMBOL) ); } @@ -2938,7 +3058,7 @@ void ts_query_disable_capture( int id = symbol_table_id_for_name(&self->captures, name, length); if (id != -1) { for (unsigned i = 0; i < self->steps.size; i++) { - QueryStep *step = &self->steps.contents[i]; + QueryStep *step = array_get(&self->steps, i); query_step__remove_capture(step, id); } } @@ -2951,7 +3071,7 @@ void ts_query_disable_pattern( // Remove the given pattern from the pattern map. Its steps will still // be in the `steps` array, but they will never be read. for (unsigned i = 0; i < self->pattern_map.size; i++) { - PatternEntry *pattern = &self->pattern_map.contents[i]; + PatternEntry *pattern = array_get(&self->pattern_map, i); if (pattern->pattern_index == pattern_index) { array_erase(&self->pattern_map, i); i--; @@ -2977,6 +3097,9 @@ TSQueryCursor *ts_query_cursor_new(void) { .start_point = {0, 0}, .end_point = POINT_MAX, .max_start_depth = UINT32_MAX, + .timeout_duration = 0, + .end_clock = clock_null(), + .operation_count = 0, }; array_reserve(&self->states, 8); array_reserve(&self->finished_states, 8); @@ -3003,6 +3126,14 @@ void ts_query_cursor_set_match_limit(TSQueryCursor *self, uint32_t limit) { self->capture_list_pool.max_capture_list_count = limit; } +uint64_t ts_query_cursor_timeout_micros(const TSQueryCursor *self) { + return duration_to_micros(self->timeout_duration); +} + +void ts_query_cursor_set_timeout_micros(TSQueryCursor *self, uint64_t timeout_micros) { + self->timeout_duration = duration_from_micros(timeout_micros); +} + #ifdef DEBUG_EXECUTE_QUERY #define LOG(...) 
fprintf(stderr, __VA_ARGS__) #else @@ -3014,10 +3145,10 @@ void ts_query_cursor_exec( const TSQuery *query, TSNode node ) { - if (query) { + if (query) { LOG("query steps:\n"); for (unsigned i = 0; i < query->steps.size; i++) { - QueryStep *step = &query->steps.contents[i]; + QueryStep *step = array_get(&query->steps, i); LOG(" %u: {", i); if (step->depth == PATTERN_DONE_MARKER) { LOG("DONE"); @@ -3051,9 +3182,32 @@ void ts_query_cursor_exec( self->halted = false; self->query = query; self->did_exceed_match_limit = false; + self->operation_count = 0; + if (self->timeout_duration) { + self->end_clock = clock_after(clock_now(), self->timeout_duration); + } else { + self->end_clock = clock_null(); + } + self->query_options = NULL; + self->query_state = (TSQueryCursorState) {0}; +} + +void ts_query_cursor_exec_with_options( + TSQueryCursor *self, + const TSQuery *query, + TSNode node, + const TSQueryCursorOptions *query_options +) { + ts_query_cursor_exec(self, query, node); + if (query_options) { + self->query_options = query_options; + self->query_state = (TSQueryCursorState) { + .payload = query_options->payload + }; + } } -void ts_query_cursor_set_byte_range( +bool ts_query_cursor_set_byte_range( TSQueryCursor *self, uint32_t start_byte, uint32_t end_byte @@ -3061,11 +3215,15 @@ void ts_query_cursor_set_byte_range( if (end_byte == 0) { end_byte = UINT32_MAX; } + if (start_byte > end_byte) { + return false; + } self->start_byte = start_byte; self->end_byte = end_byte; + return true; } -void ts_query_cursor_set_point_range( +bool ts_query_cursor_set_point_range( TSQueryCursor *self, TSPoint start_point, TSPoint end_point @@ -3073,8 +3231,12 @@ void ts_query_cursor_set_point_range( if (end_point.row == 0 && end_point.column == 0) { end_point = POINT_MAX; } + if (point_gt(start_point, end_point)) { + return false; + } self->start_point = start_point; self->end_point = end_point; + return true; } // Search through all of the in-progress states, and find the captured @@ 
-3084,14 +3246,14 @@ static bool ts_query_cursor__first_in_progress_capture( uint32_t *state_index, uint32_t *byte_offset, uint32_t *pattern_index, - bool *root_pattern_guaranteed + bool *is_definite ) { bool result = false; *state_index = UINT32_MAX; *byte_offset = UINT32_MAX; *pattern_index = UINT32_MAX; for (unsigned i = 0; i < self->states.size; i++) { - QueryState *state = &self->states.contents[i]; + QueryState *state = array_get(&self->states, i); if (state->dead) continue; const CaptureList *captures = capture_list_pool_get( @@ -3102,7 +3264,7 @@ static bool ts_query_cursor__first_in_progress_capture( continue; } - TSNode node = captures->contents[state->consumed_capture_count].node; + TSNode node = array_get(captures, state->consumed_capture_count)->node; if ( ts_node_end_byte(node) <= self->start_byte || point_lte(ts_node_end_point(node), self->start_point) @@ -3118,9 +3280,12 @@ static bool ts_query_cursor__first_in_progress_capture( node_start_byte < *byte_offset || (node_start_byte == *byte_offset && state->pattern_index < *pattern_index) ) { - QueryStep *step = &self->query->steps.contents[state->step_index]; - if (root_pattern_guaranteed) { - *root_pattern_guaranteed = step->root_pattern_guaranteed; + QueryStep *step = array_get(&self->query->steps, state->step_index); + if (is_definite) { + // We're being a bit conservative here by asserting that the following step + // is not immediate, because this capture might end up being discarded if the + // following symbol in the tree isn't the required symbol for this step. 
+ *is_definite = step->root_pattern_guaranteed && !step->is_immediate; } else if (step->root_pattern_guaranteed) { continue; } @@ -3171,8 +3336,8 @@ void ts_query_cursor__compare_captures( for (;;) { if (i < left_captures->size) { if (j < right_captures->size) { - TSQueryCapture *left = &left_captures->contents[i]; - TSQueryCapture *right = &right_captures->contents[j]; + TSQueryCapture *left = array_get(left_captures, i); + TSQueryCapture *right = array_get(right_captures, j); if (left->node.id == right->node.id && left->index == right->index) { i++; j++; @@ -3211,7 +3376,7 @@ static void ts_query_cursor__add_state( TSQueryCursor *self, const PatternEntry *pattern ) { - QueryStep *step = &self->query->steps.contents[pattern->step_index]; + QueryStep *step = array_get(&self->query->steps, pattern->step_index); uint32_t start_depth = self->depth - step->depth; // Keep the states array in ascending order of start_depth and pattern_index, @@ -3235,7 +3400,7 @@ static void ts_query_cursor__add_state( // need to execute in order to keep the states ordered by pattern_index. uint32_t index = self->states.size; while (index > 0) { - QueryState *prev_state = &self->states.contents[index - 1]; + QueryState *prev_state = array_get(&self->states, index - 1); if (prev_state->start_depth < start_depth) break; if (prev_state->start_depth == start_depth) { // Avoid inserting an unnecessary duplicate state, which would be @@ -3299,7 +3464,7 @@ static CaptureList *ts_query_cursor__prepare_to_capture( " abandon state. 
index:%u, pattern:%u, offset:%u.\n", state_index, pattern_index, byte_offset ); - QueryState *other_state = &self->states.contents[state_index]; + QueryState *other_state = array_get(&self->states, state_index); state->capture_list_id = other_state->capture_list_id; other_state->capture_list_id = NONE; other_state->dead = true; @@ -3369,8 +3534,8 @@ static QueryState *ts_query_cursor__copy_state( } array_insert(&self->states, state_index + 1, copy); - *state_ref = &self->states.contents[state_index]; - return &self->states.contents[state_index + 1]; + *state_ref = array_get(&self->states, state_index); + return array_get(&self->states, state_index + 1); } static inline bool ts_query_cursor__should_descend( @@ -3385,8 +3550,8 @@ static inline bool ts_query_cursor__should_descend( // If there are in-progress matches whose remaining steps occur // deeper in the tree, then descend. for (unsigned i = 0; i < self->states.size; i++) { - QueryState *state = &self->states.contents[i];; - QueryStep *next_step = &self->query->steps.contents[state->step_index]; + QueryState *state = array_get(&self->states, i); + QueryStep *next_step = array_get(&self->query->steps, state->step_index); if ( next_step->depth != PATTERN_DONE_MARKER && state->start_depth + next_step->depth > self->depth @@ -3447,7 +3612,26 @@ static inline bool ts_query_cursor__advance( } } - if (did_match || self->halted) return did_match; + if (++self->operation_count == OP_COUNT_PER_QUERY_TIMEOUT_CHECK) { + self->operation_count = 0; + } + + if (self->query_options && self->query_options->progress_callback) { + self->query_state.current_byte_offset = ts_node_start_byte(ts_tree_cursor_current_node(&self->cursor)); + } + if ( + did_match || + self->halted || + ( + self->operation_count == 0 && + ( + (!clock_is_null(self->end_clock) && clock_is_gt(clock_now(), self->end_clock)) || + (self->query_options && self->query_options->progress_callback && self->query_options->progress_callback(&self->query_state)) + ) + 
) + ) { + return did_match; + } // Exit the current node. if (self->ascending) { @@ -3461,8 +3645,8 @@ static inline bool ts_query_cursor__advance( // After leaving a node, remove any states that cannot make further progress. uint32_t deleted_count = 0; for (unsigned i = 0, n = self->states.size; i < n; i++) { - QueryState *state = &self->states.contents[i]; - QueryStep *step = &self->query->steps.contents[state->step_index]; + QueryState *state = array_get(&self->states, i); + QueryStep *step = array_get(&self->query->steps, state->step_index); // If a state completed its pattern inside of this node, but was deferred from finishing // in order to search for longer matches, mark it as finished. @@ -3495,7 +3679,7 @@ static inline bool ts_query_cursor__advance( } else if (deleted_count > 0) { - self->states.contents[i - deleted_count] = *state; + *array_get(&self->states, i - deleted_count) = *state; } } self->states.size -= deleted_count; @@ -3532,6 +3716,13 @@ static inline bool ts_query_cursor__advance( // Get the properties of the current node. 
TSNode node = ts_tree_cursor_current_node(&self->cursor); TSNode parent_node = ts_tree_cursor_parent_node(&self->cursor); + + uint32_t start_byte = ts_node_start_byte(node); + uint32_t end_byte = ts_node_end_byte(node); + TSPoint start_point = ts_node_start_point(node); + TSPoint end_point = ts_node_end_point(node); + bool is_empty = start_byte == end_byte; + bool parent_precedes_range = !ts_node_is_null(parent_node) && ( ts_node_end_byte(parent_node) <= self->start_byte || point_lte(ts_node_end_point(parent_node), self->start_point) @@ -3540,13 +3731,16 @@ static inline bool ts_query_cursor__advance( ts_node_start_byte(parent_node) >= self->end_byte || point_gte(ts_node_start_point(parent_node), self->end_point) ); - bool node_precedes_range = parent_precedes_range || ( - ts_node_end_byte(node) <= self->start_byte || - point_lte(ts_node_end_point(node), self->start_point) - ); + bool node_precedes_range = + parent_precedes_range || + end_byte < self->start_byte || + point_lt(end_point, self->start_point) || + (!is_empty && end_byte == self->start_byte) || + (!is_empty && point_eq(end_point, self->start_point)); + bool node_follows_range = parent_follows_range || ( - ts_node_start_byte(node) >= self->end_byte || - point_gte(ts_node_start_point(node), self->end_point) + start_byte >= self->end_byte || + point_gte(start_point, self->end_point) ); bool parent_intersects_range = !parent_precedes_range && !parent_follows_range; bool node_intersects_range = !node_precedes_range && !node_follows_range; @@ -3554,6 +3748,7 @@ static inline bool ts_query_cursor__advance( if (self->on_visible_node) { TSSymbol symbol = ts_node_symbol(node); bool is_named = ts_node_is_named(node); + bool is_missing = ts_node_is_missing(node); bool has_later_siblings; bool has_later_named_siblings; bool can_have_later_siblings_with_this_field; @@ -3587,11 +3782,11 @@ static inline bool ts_query_cursor__advance( // Add new states for any patterns whose root node is a wildcard. 
if (!node_is_error) { for (unsigned i = 0; i < self->query->wildcard_root_pattern_count; i++) { - PatternEntry *pattern = &self->query->pattern_map.contents[i]; + PatternEntry *pattern = array_get(&self->query->pattern_map, i); // If this node matches the first step of the pattern, then add a new // state at the start of this pattern. - QueryStep *step = &self->query->steps.contents[pattern->step_index]; + QueryStep *step = array_get(&self->query->steps, pattern->step_index); uint32_t start_depth = self->depth - step->depth; if ( (pattern->is_rooted ? @@ -3609,9 +3804,9 @@ static inline bool ts_query_cursor__advance( // Add new states for any patterns whose root node matches this node. unsigned i; if (ts_query__pattern_map_search(self->query, symbol, &i)) { - PatternEntry *pattern = &self->query->pattern_map.contents[i]; + PatternEntry *pattern = array_get(&self->query->pattern_map, i); - QueryStep *step = &self->query->steps.contents[pattern->step_index]; + QueryStep *step = array_get(&self->query->steps, pattern->step_index); uint32_t start_depth = self->depth - step->depth; do { // If this node matches the first step of the pattern, then add a new @@ -3629,15 +3824,15 @@ static inline bool ts_query_cursor__advance( // Advance to the next pattern whose root node matches this node. i++; if (i == self->query->pattern_map.size) break; - pattern = &self->query->pattern_map.contents[i]; - step = &self->query->steps.contents[pattern->step_index]; + pattern = array_get(&self->query->pattern_map, i); + step = array_get(&self->query->steps, pattern->step_index); } while (step->symbol == symbol); } // Update all of the in-progress states with current node. 
for (unsigned j = 0, copy_count = 0; j < self->states.size; j += 1 + copy_count) { - QueryState *state = &self->states.contents[j]; - QueryStep *step = &self->query->steps.contents[state->step_index]; + QueryState *state = array_get(&self->states, j); + QueryStep *step = array_get(&self->query->steps, state->step_index); state->has_in_progress_alternatives = false; copy_count = 0; @@ -3650,9 +3845,13 @@ static inline bool ts_query_cursor__advance( // pattern. bool node_does_match = false; if (step->symbol == WILDCARD_SYMBOL) { - node_does_match = !node_is_error && (is_named || !step->is_named); + if (step->is_missing) { + node_does_match = is_missing; + } else { + node_does_match = !node_is_error && (is_named || !step->is_named); + } } else { - node_does_match = symbol == step->symbol; + node_does_match = symbol == step->symbol && (!step->is_missing || is_missing); } bool later_sibling_can_match = has_later_siblings; if ((step->is_immediate && is_named) || state->seeking_immediate_match) { @@ -3682,7 +3881,7 @@ static inline bool ts_query_cursor__advance( } if (step->negated_field_list_id) { - TSFieldId *negated_field_ids = &self->query->negated_fields.contents[step->negated_field_list_id]; + TSFieldId *negated_field_ids = array_get(&self->query->negated_fields, step->negated_field_list_id); for (;;) { TSFieldId negated_field_id = *negated_field_ids; if (negated_field_id) { @@ -3777,14 +3976,28 @@ static inline bool ts_query_cursor__advance( // Advance this state to the next step of its pattern. state->step_index++; - state->seeking_immediate_match = false; LOG( " advance state. 
pattern:%u, step:%u\n", state->pattern_index, state->step_index ); - QueryStep *next_step = &self->query->steps.contents[state->step_index]; + QueryStep *next_step = array_get(&self->query->steps, state->step_index); + + // For a given step, if the current symbol is the wildcard symbol, `_`, and it is **not** + // named, meaning it should capture anonymous nodes, **and** the next step is immediate, + // we reuse the `seeking_immediate_match` flag to indicate that we are looking for an + // immediate match due to an unnamed wildcard symbol. + // + // The reason for this is that typically, anchors will not consider anonymous nodes, + // but we're special casing the wildcard symbol to allow for any immediate matches, + // regardless of whether they are named or not. + if (step->symbol == WILDCARD_SYMBOL && !step->is_named && next_step->is_immediate) { + state->seeking_immediate_match = true; + } else { + state->seeking_immediate_match = false; + } + if (stop_on_definite_step && next_step->root_pattern_guaranteed) did_match = true; // If this state's next step has an alternative step, then copy the state in order @@ -3792,8 +4005,8 @@ static inline bool ts_query_cursor__advance( // so this is an interactive process. unsigned end_index = j + 1; for (unsigned k = j; k < end_index; k++) { - QueryState *child_state = &self->states.contents[k]; - QueryStep *child_step = &self->query->steps.contents[child_state->step_index]; + QueryState *child_state = array_get(&self->states, k); + QueryStep *child_step = array_get(&self->query->steps, child_state->step_index); if (child_step->alternative_index != NONE) { // A "dead-end" step exists only to add a non-sequential jump into the step sequence, // via its alternative index. 
When a state reaches a dead-end step, it jumps straight @@ -3834,7 +4047,7 @@ static inline bool ts_query_cursor__advance( } for (unsigned j = 0; j < self->states.size; j++) { - QueryState *state = &self->states.contents[j]; + QueryState *state = array_get(&self->states, j); if (state->dead) { array_erase(&self->states, j); j--; @@ -3846,7 +4059,7 @@ static inline bool ts_query_cursor__advance( // one state has a strict subset of another state's captures. bool did_remove = false; for (unsigned k = j + 1; k < self->states.size; k++) { - QueryState *other_state = &self->states.contents[k]; + QueryState *other_state = array_get(&self->states, k); // Query states are kept in ascending order of start_depth and pattern_index. // Since the longest-match criteria is only used for deduping matches of the same @@ -3906,7 +4119,7 @@ static inline bool ts_query_cursor__advance( state->step_index, capture_list_pool_get(&self->capture_list_pool, state->capture_list_id)->size ); - QueryStep *next_step = &self->query->steps.contents[state->step_index]; + QueryStep *next_step = array_get(&self->query->steps, state->step_index); if (next_step->depth == PATTERN_DONE_MARKER) { if (state->has_in_progress_alternatives) { LOG(" defer finishing pattern %u\n", state->pattern_index); @@ -3951,7 +4164,7 @@ bool ts_query_cursor_next_match( } } - QueryState *state = &self->finished_states.contents[0]; + QueryState *state = array_get(&self->finished_states, 0); if (state->id == UINT32_MAX) state->id = self->next_state_id++; match->id = state->id; match->pattern_index = state->pattern_index; @@ -3971,7 +4184,7 @@ void ts_query_cursor_remove_match( uint32_t match_id ) { for (unsigned i = 0; i < self->finished_states.size; i++) { - const QueryState *state = &self->finished_states.contents[i]; + const QueryState *state = array_get(&self->finished_states, i); if (state->id == match_id) { capture_list_pool_release( &self->capture_list_pool, @@ -3985,7 +4198,7 @@ void ts_query_cursor_remove_match( // 
Remove unfinished query states as well to prevent future // captures for a match being removed. for (unsigned i = 0; i < self->states.size; i++) { - const QueryState *state = &self->states.contents[i]; + const QueryState *state = array_get(&self->states, i); if (state->id == match_id) { capture_list_pool_release( &self->capture_list_pool, @@ -4011,7 +4224,7 @@ bool ts_query_cursor_next_capture( uint32_t first_unfinished_pattern_index; uint32_t first_unfinished_state_index; bool first_unfinished_state_is_definite = false; - ts_query_cursor__first_in_progress_capture( + bool found_unfinished_state = ts_query_cursor__first_in_progress_capture( self, &first_unfinished_state_index, &first_unfinished_capture_byte, @@ -4025,7 +4238,7 @@ bool ts_query_cursor_next_capture( uint32_t first_finished_capture_byte = first_unfinished_capture_byte; uint32_t first_finished_pattern_index = first_unfinished_pattern_index; for (unsigned i = 0; i < self->finished_states.size;) { - QueryState *state = &self->finished_states.contents[i]; + QueryState *state = array_get(&self->finished_states, i); const CaptureList *captures = capture_list_pool_get( &self->capture_list_pool, state->capture_list_id @@ -4041,7 +4254,7 @@ bool ts_query_cursor_next_capture( continue; } - TSNode node = captures->contents[state->consumed_capture_count].node; + TSNode node = array_get(captures, state->consumed_capture_count)->node; bool node_precedes_range = ( ts_node_end_byte(node) <= self->start_byte || @@ -4081,7 +4294,7 @@ bool ts_query_cursor_next_capture( if (first_finished_state) { state = first_finished_state; } else if (first_unfinished_state_is_definite) { - state = &self->states.contents[first_unfinished_state_index]; + state = array_get(&self->states, first_unfinished_state_index); } else { state = NULL; } @@ -4101,7 +4314,7 @@ bool ts_query_cursor_next_capture( return true; } - if (capture_list_pool_is_empty(&self->capture_list_pool)) { + if (capture_list_pool_is_empty(&self->capture_list_pool) && 
found_unfinished_state) { LOG( " abandon state. index:%u, pattern:%u, offset:%u.\n", first_unfinished_state_index, @@ -4110,7 +4323,7 @@ bool ts_query_cursor_next_capture( ); capture_list_pool_release( &self->capture_list_pool, - self->states.contents[first_unfinished_state_index].capture_list_id + array_get(&self->states, first_unfinished_state_index)->capture_list_id ); array_erase(&self->states, first_unfinished_state_index); } diff --git a/stack.c b/stack.c index 98d8c561..91420074 100644 --- a/stack.c +++ b/stack.c @@ -82,9 +82,9 @@ typedef StackAction (*StackCallback)(void *, const StackIterator *); static void stack_node_retain(StackNode *self) { if (!self) return; - assert(self->ref_count > 0); + ts_assert(self->ref_count > 0); self->ref_count++; - assert(self->ref_count != 0); + ts_assert(self->ref_count != 0); } static void stack_node_release( @@ -93,7 +93,7 @@ static void stack_node_release( SubtreePool *subtree_pool ) { recur: - assert(self->ref_count != 0); + ts_assert(self->ref_count != 0); self->ref_count--; if (self->ref_count > 0) return; @@ -290,8 +290,8 @@ static StackVersion ts_stack__add_version( ) { StackHead head = { .node = node, - .node_count_at_last_error = self->heads.contents[original_version].node_count_at_last_error, - .last_external_token = self->heads.contents[original_version].last_external_token, + .node_count_at_last_error = array_get(&self->heads, original_version)->node_count_at_last_error, + .last_external_token = array_get(&self->heads, original_version)->last_external_token, .status = StackStatusActive, .lookahead_when_paused = NULL_SUBTREE, }; @@ -308,8 +308,8 @@ static void ts_stack__add_slice( SubtreeArray *subtrees ) { for (uint32_t i = self->slices.size - 1; i + 1 > 0; i--) { - StackVersion version = self->slices.contents[i].version; - if (self->heads.contents[version].node == node) { + StackVersion version = array_get(&self->slices, i)->version; + if (array_get(&self->heads, version)->node == node) { StackSlice slice = 
{*subtrees, version}; array_insert(&self->slices, i + 1, slice); return; @@ -349,7 +349,7 @@ static StackSliceArray stack__iter( while (self->iterators.size > 0) { for (uint32_t i = 0, size = self->iterators.size; i < size; i++) { - StackIterator *iterator = &self->iterators.contents[i]; + StackIterator *iterator = array_get(&self->iterators, i); StackNode *node = iterator->node; StackAction action = callback(payload, iterator); @@ -384,11 +384,11 @@ static StackSliceArray stack__iter( StackLink link; if (j == node->link_count) { link = node->links[0]; - next_iterator = &self->iterators.contents[i]; + next_iterator = array_get(&self->iterators, i); } else { if (self->iterators.size >= MAX_ITERATOR_COUNT) continue; link = node->links[j]; - StackIterator current_iterator = self->iterators.contents[i]; + StackIterator current_iterator = *array_get(&self->iterators, i); array_push(&self->iterators, current_iterator); next_iterator = array_back(&self->iterators); ts_subtree_array_copy(next_iterator->subtrees, &next_iterator->subtrees); @@ -444,12 +444,12 @@ void ts_stack_delete(Stack *self) { array_delete(&self->iterators); stack_node_release(self->base_node, &self->node_pool, self->subtree_pool); for (uint32_t i = 0; i < self->heads.size; i++) { - stack_head_delete(&self->heads.contents[i], &self->node_pool, self->subtree_pool); + stack_head_delete(array_get(&self->heads, i), &self->node_pool, self->subtree_pool); } array_clear(&self->heads); if (self->node_pool.contents) { for (uint32_t i = 0; i < self->node_pool.size; i++) - ts_free(self->node_pool.contents[i]); + ts_free(*array_get(&self->node_pool, i)); array_delete(&self->node_pool); } array_delete(&self->heads); @@ -460,6 +460,17 @@ uint32_t ts_stack_version_count(const Stack *self) { return self->heads.size; } +uint32_t ts_stack_halted_version_count(Stack *self) { + uint32_t count = 0; + for (uint32_t i = 0; i < self->heads.size; i++) { + StackHead *head = array_get(&self->heads, i); + if (head->status == 
StackStatusHalted) { + count++; + } + } + return count; +} + TSStateId ts_stack_state(const Stack *self, StackVersion version) { return array_get(&self->heads, version)->node->state; } @@ -524,6 +535,7 @@ StackSliceArray ts_stack_pop_count(Stack *self, StackVersion version, uint32_t c return stack__iter(self, version, pop_count_callback, &count, (int)count); } + forceinline StackAction pop_pending_callback(void *payload, const StackIterator *iterator) { (void)payload; if (iterator->subtree_count >= 1) { @@ -540,8 +552,8 @@ forceinline StackAction pop_pending_callback(void *payload, const StackIterator StackSliceArray ts_stack_pop_pending(Stack *self, StackVersion version) { StackSliceArray pop = stack__iter(self, version, pop_pending_callback, NULL, 0); if (pop.size > 0) { - ts_stack_renumber_version(self, pop.contents[0].version, version); - pop.contents[0].version = version; + ts_stack_renumber_version(self, array_get(&pop, 0)->version, version); + array_get(&pop, 0)->version = version; } return pop; } @@ -549,7 +561,7 @@ StackSliceArray ts_stack_pop_pending(Stack *self, StackVersion version) { forceinline StackAction pop_error_callback(void *payload, const StackIterator *iterator) { if (iterator->subtrees.size > 0) { bool *found_error = payload; - if (!*found_error && ts_subtree_is_error(iterator->subtrees.contents[0])) { + if (!*found_error && ts_subtree_is_error(*array_get(&iterator->subtrees, 0))) { *found_error = true; return StackActionPop | StackActionStop; } else { @@ -567,9 +579,9 @@ SubtreeArray ts_stack_pop_error(Stack *self, StackVersion version) { bool found_error = false; StackSliceArray pop = stack__iter(self, version, pop_error_callback, &found_error, 1); if (pop.size > 0) { - assert(pop.size == 1); - ts_stack_renumber_version(self, pop.contents[0].version, version); - return pop.contents[0].subtrees; + ts_assert(pop.size == 1); + ts_stack_renumber_version(self, array_get(&pop, 0)->version, version); + return array_get(&pop, 0)->subtrees; } break; 
} @@ -597,7 +609,7 @@ forceinline StackAction summarize_stack_callback(void *payload, const StackItera unsigned depth = iterator->subtree_count; if (depth > session->max_depth) return StackActionStop; for (unsigned i = session->summary->size - 1; i + 1 > 0; i--) { - StackSummaryEntry entry = session->summary->contents[i]; + StackSummaryEntry entry = *array_get(session->summary, i); if (entry.depth < depth) break; if (entry.depth == depth && entry.state == state) return StackActionNone; } @@ -616,7 +628,7 @@ void ts_stack_record_summary(Stack *self, StackVersion version, unsigned max_dep }; array_init(session.summary); stack__iter(self, version, summarize_stack_callback, &session, -1); - StackHead *head = &self->heads.contents[version]; + StackHead *head = array_get(&self->heads, version); if (head->summary) { array_delete(head->summary); ts_free(head->summary); @@ -663,10 +675,10 @@ void ts_stack_remove_version(Stack *self, StackVersion version) { void ts_stack_renumber_version(Stack *self, StackVersion v1, StackVersion v2) { if (v1 == v2) return; - assert(v2 < v1); - assert((uint32_t)v1 < self->heads.size); - StackHead *source_head = &self->heads.contents[v1]; - StackHead *target_head = &self->heads.contents[v2]; + ts_assert(v2 < v1); + ts_assert((uint32_t)v1 < self->heads.size); + StackHead *source_head = array_get(&self->heads, v1); + StackHead *target_head = array_get(&self->heads, v2); if (target_head->summary && !source_head->summary) { source_head->summary = target_head->summary; target_head->summary = NULL; @@ -677,14 +689,15 @@ void ts_stack_renumber_version(Stack *self, StackVersion v1, StackVersion v2) { } void ts_stack_swap_versions(Stack *self, StackVersion v1, StackVersion v2) { - StackHead temporary_head = self->heads.contents[v1]; - self->heads.contents[v1] = self->heads.contents[v2]; - self->heads.contents[v2] = temporary_head; + StackHead temporary_head = *array_get(&self->heads, v1); + *array_get(&self->heads, v1) = *array_get(&self->heads, v2); 
+ *array_get(&self->heads, v2) = temporary_head; } StackVersion ts_stack_copy_version(Stack *self, StackVersion version) { - assert(version < self->heads.size); - array_push(&self->heads, self->heads.contents[version]); + ts_assert(version < self->heads.size); + StackHead version_head = *array_get(&self->heads, version); + array_push(&self->heads, version_head); StackHead *head = array_back(&self->heads); stack_node_retain(head->node); if (head->last_external_token.ptr) ts_subtree_retain(head->last_external_token); @@ -694,8 +707,8 @@ StackVersion ts_stack_copy_version(Stack *self, StackVersion version) { bool ts_stack_merge(Stack *self, StackVersion version1, StackVersion version2) { if (!ts_stack_can_merge(self, version1, version2)) return false; - StackHead *head1 = &self->heads.contents[version1]; - StackHead *head2 = &self->heads.contents[version2]; + StackHead *head1 = array_get(&self->heads, version1); + StackHead *head2 = array_get(&self->heads, version2); for (uint32_t i = 0; i < head2->node->link_count; i++) { stack_node_add_link(head1->node, head2->node->links[i], self->subtree_pool); } @@ -707,8 +720,8 @@ bool ts_stack_merge(Stack *self, StackVersion version1, StackVersion version2) { } bool ts_stack_can_merge(Stack *self, StackVersion version1, StackVersion version2) { - StackHead *head1 = &self->heads.contents[version1]; - StackHead *head2 = &self->heads.contents[version2]; + StackHead *head1 = array_get(&self->heads, version1); + StackHead *head2 = array_get(&self->heads, version2); return head1->status == StackStatusActive && head2->status == StackStatusActive && @@ -743,7 +756,7 @@ bool ts_stack_is_paused(const Stack *self, StackVersion version) { Subtree ts_stack_resume(Stack *self, StackVersion version) { StackHead *head = array_get(&self->heads, version); - assert(head->status == StackStatusPaused); + ts_assert(head->status == StackStatusPaused); Subtree result = head->lookahead_when_paused; head->status = StackStatusActive; 
head->lookahead_when_paused = NULL_SUBTREE; @@ -753,7 +766,7 @@ Subtree ts_stack_resume(Stack *self, StackVersion version) { void ts_stack_clear(Stack *self) { stack_node_retain(self->base_node); for (uint32_t i = 0; i < self->heads.size; i++) { - stack_head_delete(&self->heads.contents[i], &self->node_pool, self->subtree_pool); + stack_head_delete(array_get(&self->heads, i), &self->node_pool, self->subtree_pool); } array_clear(&self->heads); array_push(&self->heads, ((StackHead) { @@ -776,7 +789,7 @@ bool ts_stack_print_dot_graph(Stack *self, const TSLanguage *language, FILE *f) array_clear(&self->iterators); for (uint32_t i = 0; i < self->heads.size; i++) { - StackHead *head = &self->heads.contents[i]; + StackHead *head = array_get(&self->heads, i); if (head->status == StackStatusHalted) continue; fprintf(f, "node_head_%u [shape=none, label=\"\"]\n", i); @@ -794,7 +807,7 @@ bool ts_stack_print_dot_graph(Stack *self, const TSLanguage *language, FILE *f) if (head->summary) { fprintf(f, "\nsummary:"); - for (uint32_t j = 0; j < head->summary->size; j++) fprintf(f, " %u", head->summary->contents[j].state); + for (uint32_t j = 0; j < head->summary->size; j++) fprintf(f, " %u", array_get(head->summary, j)->state); } if (head->last_external_token.ptr) { @@ -815,11 +828,11 @@ bool ts_stack_print_dot_graph(Stack *self, const TSLanguage *language, FILE *f) all_iterators_done = true; for (uint32_t i = 0; i < self->iterators.size; i++) { - StackIterator iterator = self->iterators.contents[i]; + StackIterator iterator = *array_get(&self->iterators, i); StackNode *node = iterator.node; for (uint32_t j = 0; j < visited_nodes.size; j++) { - if (visited_nodes.contents[j] == node) { + if (*array_get(&visited_nodes, j) == node) { node = NULL; break; } @@ -878,7 +891,7 @@ bool ts_stack_print_dot_graph(Stack *self, const TSLanguage *language, FILE *f) StackIterator *next_iterator; if (j == 0) { - next_iterator = &self->iterators.contents[i]; + next_iterator = 
array_get(&self->iterators, i); } else { array_push(&self->iterators, iterator); next_iterator = array_back(&self->iterators); diff --git a/stack.h b/stack.h index 86abbc9d..2619f1e8 100644 --- a/stack.h +++ b/stack.h @@ -7,7 +7,6 @@ extern "C" { #include "./array.h" #include "./subtree.h" -#include "./error_costs.h" #include typedef struct Stack Stack; @@ -29,23 +28,26 @@ typedef struct { typedef Array(StackSummaryEntry) StackSummary; // Create a stack. -Stack *ts_stack_new(SubtreePool *); +Stack *ts_stack_new(SubtreePool *subtree_pool); // Release the memory reserved for a given stack. -void ts_stack_delete(Stack *); +void ts_stack_delete(Stack *self); // Get the stack's current number of versions. -uint32_t ts_stack_version_count(const Stack *); +uint32_t ts_stack_version_count(const Stack *self); + +// Get the stack's current number of halted versions. +uint32_t ts_stack_halted_version_count(Stack *self); // Get the state at the top of the given version of the stack. If the stack is // empty, this returns the initial state, 0. -TSStateId ts_stack_state(const Stack *, StackVersion); +TSStateId ts_stack_state(const Stack *self, StackVersion version); // Get the last external token associated with a given version of the stack. -Subtree ts_stack_last_external_token(const Stack *, StackVersion); +Subtree ts_stack_last_external_token(const Stack *self, StackVersion version); // Set the last external token associated with a given version of the stack. -void ts_stack_set_last_external_token(Stack *, StackVersion, Subtree ); +void ts_stack_set_last_external_token(Stack *self, StackVersion version, Subtree token); // Get the position of the given version of the stack within the document. Length ts_stack_position(const Stack *, StackVersion); @@ -55,76 +57,74 @@ Length ts_stack_position(const Stack *, StackVersion); // This transfers ownership of the tree to the Stack. 
Callers that // need to retain ownership of the tree for their own purposes should // first retain the tree. -void ts_stack_push(Stack *, StackVersion, Subtree , bool, TSStateId); +void ts_stack_push(Stack *self, StackVersion version, Subtree subtree, bool pending, TSStateId state); // Pop the given number of entries from the given version of the stack. This // operation can increase the number of stack versions by revealing multiple // versions which had previously been merged. It returns an array that // specifies the index of each revealed version and the trees that were // removed from that version. -StackSliceArray ts_stack_pop_count(Stack *, StackVersion, uint32_t count); +StackSliceArray ts_stack_pop_count(Stack *self, StackVersion version, uint32_t count); // Remove an error at the top of the given version of the stack. -SubtreeArray ts_stack_pop_error(Stack *, StackVersion); +SubtreeArray ts_stack_pop_error(Stack *self, StackVersion version); // Remove any pending trees from the top of the given version of the stack. -StackSliceArray ts_stack_pop_pending(Stack *, StackVersion); +StackSliceArray ts_stack_pop_pending(Stack *self, StackVersion version); -// Remove any all trees from the given version of the stack. -StackSliceArray ts_stack_pop_all(Stack *, StackVersion); +// Remove all trees from the given version of the stack. +StackSliceArray ts_stack_pop_all(Stack *self, StackVersion version); // Get the maximum number of tree nodes reachable from this version of the stack // since the last error was detected. 
-unsigned ts_stack_node_count_since_error(const Stack *, StackVersion); +unsigned ts_stack_node_count_since_error(const Stack *self, StackVersion version); -int ts_stack_dynamic_precedence(Stack *, StackVersion); +int ts_stack_dynamic_precedence(Stack *self, StackVersion version); -bool ts_stack_has_advanced_since_error(const Stack *, StackVersion); +bool ts_stack_has_advanced_since_error(const Stack *self, StackVersion version); // Compute a summary of all the parse states near the top of the given // version of the stack and store the summary for later retrieval. -void ts_stack_record_summary(Stack *, StackVersion, unsigned max_depth); +void ts_stack_record_summary(Stack *self, StackVersion version, unsigned max_depth); // Retrieve a summary of all the parse states near the top of the // given version of the stack. -StackSummary *ts_stack_get_summary(Stack *, StackVersion); +StackSummary *ts_stack_get_summary(Stack *self, StackVersion version); // Get the total cost of all errors on the given version of the stack. -unsigned ts_stack_error_cost(const Stack *, StackVersion version); +unsigned ts_stack_error_cost(const Stack *self, StackVersion version); // Merge the given two stack versions if possible, returning true // if they were successfully merged and false otherwise. -bool ts_stack_merge(Stack *, StackVersion, StackVersion); +bool ts_stack_merge(Stack *self, StackVersion version1, StackVersion version2); // Determine whether the given two stack versions can be merged. 
-bool ts_stack_can_merge(Stack *, StackVersion, StackVersion); +bool ts_stack_can_merge(Stack *self, StackVersion version1, StackVersion version2); -Subtree ts_stack_resume(Stack *, StackVersion); +Subtree ts_stack_resume(Stack *self, StackVersion version); -void ts_stack_pause(Stack *, StackVersion, Subtree); +void ts_stack_pause(Stack *self, StackVersion version, Subtree lookahead); -void ts_stack_halt(Stack *, StackVersion); +void ts_stack_halt(Stack *self, StackVersion version); -bool ts_stack_is_active(const Stack *, StackVersion); +bool ts_stack_is_active(const Stack *self, StackVersion version); -bool ts_stack_is_paused(const Stack *, StackVersion); +bool ts_stack_is_paused(const Stack *self, StackVersion version); -bool ts_stack_is_halted(const Stack *, StackVersion); +bool ts_stack_is_halted(const Stack *self, StackVersion version); -void ts_stack_renumber_version(Stack *, StackVersion, StackVersion); +void ts_stack_renumber_version(Stack *self, StackVersion v1, StackVersion v2); -void ts_stack_swap_versions(Stack *, StackVersion, StackVersion); +void ts_stack_swap_versions(Stack *, StackVersion v1, StackVersion v2); -StackVersion ts_stack_copy_version(Stack *, StackVersion); +StackVersion ts_stack_copy_version(Stack *self, StackVersion version); // Remove the given version from the stack. 
-void ts_stack_remove_version(Stack *, StackVersion); - -void ts_stack_clear(Stack *); +void ts_stack_remove_version(Stack *self, StackVersion version); -bool ts_stack_print_dot_graph(Stack *, const TSLanguage *, FILE *); +void ts_stack_clear(Stack *self); -typedef void (*StackIterateCallback)(void *, TSStateId, uint32_t); +bool ts_stack_print_dot_graph(Stack *self, const TSLanguage *language, FILE *f); #ifdef __cplusplus } diff --git a/subtree.c b/subtree.c index 4524e182..97d55c86 100644 --- a/subtree.c +++ b/subtree.c @@ -1,4 +1,3 @@ -#include #include #include #include @@ -11,6 +10,7 @@ #include "./length.h" #include "./language.h" #include "./error_costs.h" +#include "./ts_assert.h" #include typedef struct { @@ -73,14 +73,14 @@ void ts_subtree_array_copy(SubtreeArray self, SubtreeArray *dest) { dest->contents = ts_calloc(self.capacity, sizeof(Subtree)); memcpy(dest->contents, self.contents, self.size * sizeof(Subtree)); for (uint32_t i = 0; i < self.size; i++) { - ts_subtree_retain(dest->contents[i]); + ts_subtree_retain(*array_get(dest, i)); } } } void ts_subtree_array_clear(SubtreePool *pool, SubtreeArray *self) { for (uint32_t i = 0; i < self->size; i++) { - ts_subtree_release(pool, self->contents[i]); + ts_subtree_release(pool, *array_get(self, i)); } array_clear(self); } @@ -96,7 +96,7 @@ void ts_subtree_array_remove_trailing_extras( ) { array_clear(destination); while (self->size > 0) { - Subtree last = self->contents[self->size - 1]; + Subtree last = *array_get(self, self->size - 1); if (ts_subtree_extra(last)) { self->size--; array_push(destination, last); @@ -110,9 +110,9 @@ void ts_subtree_array_remove_trailing_extras( void ts_subtree_array_reverse(SubtreeArray *self) { for (uint32_t i = 0, limit = self->size / 2; i < limit; i++) { size_t reverse_index = self->size - 1 - i; - Subtree swap = self->contents[i]; - self->contents[i] = self->contents[reverse_index]; - self->contents[reverse_index] = swap; + Subtree swap = *array_get(self, i); + 
*array_get(self, i) = *array_get(self, reverse_index); + *array_get(self, reverse_index) = swap; } } @@ -127,7 +127,7 @@ SubtreePool ts_subtree_pool_new(uint32_t capacity) { void ts_subtree_pool_delete(SubtreePool *self) { if (self->free_trees.contents) { for (unsigned i = 0; i < self->free_trees.size; i++) { - ts_free(self->free_trees.contents[i].ptr); + ts_free(array_get(&self->free_trees, i)->ptr); } array_delete(&self->free_trees); } @@ -157,6 +157,7 @@ static inline bool ts_subtree_can_inline(Length padding, Length size, uint32_t l padding.bytes < TS_MAX_INLINE_TREE_LENGTH && padding.extent.row < 16 && padding.extent.column < TS_MAX_INLINE_TREE_LENGTH && + size.bytes < TS_MAX_INLINE_TREE_LENGTH && size.extent.row == 0 && size.extent.column < TS_MAX_INLINE_TREE_LENGTH && lookahead_bytes < 16; @@ -229,7 +230,7 @@ void ts_subtree_set_symbol( ) { TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol); if (self->data.is_inline) { - assert(symbol < UINT8_MAX); + ts_assert(symbol < UINT8_MAX); self->data.symbol = symbol; self->data.named = metadata.named; self->data.visible = metadata.visible; @@ -288,7 +289,7 @@ MutableSubtree ts_subtree_make_mut(SubtreePool *pool, Subtree self) { return result; } -static void ts_subtree__compress( +void ts_subtree_compress( MutableSubtree self, unsigned count, const TSLanguage *language, @@ -334,44 +335,12 @@ static void ts_subtree__compress( } } -void ts_subtree_balance(Subtree self, SubtreePool *pool, const TSLanguage *language) { - array_clear(&pool->tree_stack); - - if (ts_subtree_child_count(self) > 0 && self.ptr->ref_count == 1) { - array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(self)); - } - - while (pool->tree_stack.size > 0) { - MutableSubtree tree = array_pop(&pool->tree_stack); - - if (tree.ptr->repeat_depth > 0) { - Subtree child1 = ts_subtree_children(tree)[0]; - Subtree child2 = ts_subtree_children(tree)[tree.ptr->child_count - 1]; - long repeat_delta = (long)ts_subtree_repeat_depth(child1) 
- (long)ts_subtree_repeat_depth(child2); - if (repeat_delta > 0) { - unsigned n = (unsigned)repeat_delta; - for (unsigned i = n / 2; i > 0; i /= 2) { - ts_subtree__compress(tree, i, language, &pool->tree_stack); - n -= i; - } - } - } - - for (uint32_t i = 0; i < tree.ptr->child_count; i++) { - Subtree child = ts_subtree_children(tree)[i]; - if (ts_subtree_child_count(child) > 0 && child.ptr->ref_count == 1) { - array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(child)); - } - } - } -} - // Assign all of the node's properties that depend on its children. void ts_subtree_summarize_children( MutableSubtree self, const TSLanguage *language ) { - assert(!self.data.is_inline); + ts_assert(!self.data.is_inline); self.ptr->named_child_count = 0; self.ptr->visible_child_count = 0; @@ -438,7 +407,12 @@ void ts_subtree_summarize_children( self.ptr->dynamic_precedence += ts_subtree_dynamic_precedence(child); self.ptr->visible_descendant_count += ts_subtree_visible_descendant_count(child); - if (alias_sequence && alias_sequence[structural_index] != 0 && !ts_subtree_extra(child)) { + if ( + !ts_subtree_extra(child) && + ts_subtree_symbol(child) != 0 && + alias_sequence && + alias_sequence[structural_index] != 0 + ) { self.ptr->visible_descendant_count++; self.ptr->visible_child_count++; if (ts_language_symbol_metadata(language, alias_sequence[structural_index]).named) { @@ -583,16 +557,16 @@ Subtree ts_subtree_new_missing_leaf( void ts_subtree_retain(Subtree self) { if (self.data.is_inline) return; - assert(self.ptr->ref_count > 0); + ts_assert(self.ptr->ref_count > 0); atomic_inc((volatile uint32_t *)&self.ptr->ref_count); - assert(self.ptr->ref_count != 0); + ts_assert(self.ptr->ref_count != 0); } void ts_subtree_release(SubtreePool *pool, Subtree self) { if (self.data.is_inline) return; array_clear(&pool->tree_stack); - assert(self.ptr->ref_count > 0); + ts_assert(self.ptr->ref_count > 0); if (atomic_dec((volatile uint32_t *)&self.ptr->ref_count) == 0) { 
array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(self)); } @@ -604,7 +578,7 @@ void ts_subtree_release(SubtreePool *pool, Subtree self) { for (uint32_t i = 0; i < tree.ptr->child_count; i++) { Subtree child = children[i]; if (child.data.is_inline) continue; - assert(child.ptr->ref_count > 0); + ts_assert(child.ptr->ref_count > 0); if (atomic_dec((volatile uint32_t *)&child.ptr->ref_count) == 0) { array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(child)); } @@ -677,7 +651,8 @@ Subtree ts_subtree_edit(Subtree self, const TSInputEdit *input_edit, SubtreePool Edit edit = entry.edit; bool is_noop = edit.old_end.bytes == edit.start.bytes && edit.new_end.bytes == edit.start.bytes; bool is_pure_insertion = edit.old_end.bytes == edit.start.bytes; - bool invalidate_first_row = ts_subtree_depends_on_column(*entry.tree); + bool parent_depends_on_column = ts_subtree_depends_on_column(*entry.tree); + bool column_shifted = edit.new_end.extent.column != edit.old_end.extent.column; Length size = ts_subtree_size(*entry.tree); Length padding = ts_subtree_padding(*entry.tree); @@ -699,12 +674,6 @@ Subtree ts_subtree_edit(Subtree self, const TSInputEdit *input_edit, SubtreePool padding = edit.new_end; } - // If the edit is a pure insertion right at the start of the subtree, - // shift the subtree over according to the insertion. - else if (edit.start.bytes == padding.bytes && is_pure_insertion) { - padding = edit.new_end; - } - // If the edit is within this subtree, resize the subtree to reflect the edit. else if ( edit.start.bytes < total_size.bytes || @@ -766,13 +735,17 @@ Subtree ts_subtree_edit(Subtree self, const TSInputEdit *input_edit, SubtreePool // Keep editing child nodes until a node is reached that starts after the edit. // Also, if this node's validity depends on its column position, then continue - // invaliditing child nodes until reaching a line break. + // invalidating child nodes until reaching a line break. 
if (( (child_left.bytes > edit.old_end.bytes) || (child_left.bytes == edit.old_end.bytes && child_size.bytes > 0 && i > 0) ) && ( - !invalidate_first_row || - child_left.extent.row > entry.tree->ptr->padding.extent.row + !parent_depends_on_column || + child_left.extent.row > padding.extent.row + ) && ( + !ts_subtree_depends_on_column(*child) || + !column_shifted || + child_left.extent.row > edit.old_end.extent.row )) { break; } @@ -985,6 +958,7 @@ void ts_subtree__print_dot_graph(const Subtree *self, uint32_t start_offset, if (ts_subtree_child_count(*self) == 0) fprintf(f, ", shape=plaintext"); if (ts_subtree_extra(*self)) fprintf(f, ", fontcolor=gray"); + if (ts_subtree_has_changes(*self)) fprintf(f, ", color=green, penwidth=2"); fprintf(f, ", tooltip=\"" "range: %u - %u\n" diff --git a/subtree.h b/subtree.h index 0b3062e9..3f7cd509 100644 --- a/subtree.h +++ b/subtree.h @@ -173,44 +173,61 @@ typedef struct { MutableSubtreeArray tree_stack; } SubtreePool; -void ts_external_scanner_state_init(ExternalScannerState *, const char *, unsigned); -const char *ts_external_scanner_state_data(const ExternalScannerState *); -bool ts_external_scanner_state_eq(const ExternalScannerState *self, const char *, unsigned); +void ts_external_scanner_state_init(ExternalScannerState *self, const char *data, unsigned length); +const char *ts_external_scanner_state_data(const ExternalScannerState *self); +bool ts_external_scanner_state_eq(const ExternalScannerState *self, const char *buffer, unsigned length); void ts_external_scanner_state_delete(ExternalScannerState *self); -void ts_subtree_array_copy(SubtreeArray, SubtreeArray *); -void ts_subtree_array_clear(SubtreePool *, SubtreeArray *); -void ts_subtree_array_delete(SubtreePool *, SubtreeArray *); -void ts_subtree_array_remove_trailing_extras(SubtreeArray *, SubtreeArray *); -void ts_subtree_array_reverse(SubtreeArray *); +void ts_subtree_array_copy(SubtreeArray self, SubtreeArray *dest); +void ts_subtree_array_clear(SubtreePool 
*pool, SubtreeArray *self); +void ts_subtree_array_delete(SubtreePool *pool, SubtreeArray *self); +void ts_subtree_array_remove_trailing_extras(SubtreeArray *self, SubtreeArray *destination); +void ts_subtree_array_reverse(SubtreeArray *self); SubtreePool ts_subtree_pool_new(uint32_t capacity); -void ts_subtree_pool_delete(SubtreePool *); +void ts_subtree_pool_delete(SubtreePool *self); Subtree ts_subtree_new_leaf( - SubtreePool *, TSSymbol, Length, Length, uint32_t, - TSStateId, bool, bool, bool, const TSLanguage * + SubtreePool *pool, TSSymbol symbol, Length padding, Length size, + uint32_t lookahead_bytes, TSStateId parse_state, + bool has_external_tokens, bool depends_on_column, + bool is_keyword, const TSLanguage *language ); Subtree ts_subtree_new_error( - SubtreePool *, int32_t, Length, Length, uint32_t, TSStateId, const TSLanguage * + SubtreePool *pool, int32_t lookahead_char, Length padding, Length size, + uint32_t bytes_scanned, TSStateId parse_state, const TSLanguage *language ); -MutableSubtree ts_subtree_new_node(TSSymbol, SubtreeArray *, unsigned, const TSLanguage *); -Subtree ts_subtree_new_error_node(SubtreeArray *, bool, const TSLanguage *); -Subtree ts_subtree_new_missing_leaf(SubtreePool *, TSSymbol, Length, uint32_t, const TSLanguage *); -MutableSubtree ts_subtree_make_mut(SubtreePool *, Subtree); -void ts_subtree_retain(Subtree); -void ts_subtree_release(SubtreePool *, Subtree); -int ts_subtree_compare(Subtree, Subtree, SubtreePool *); -void ts_subtree_set_symbol(MutableSubtree *, TSSymbol, const TSLanguage *); -void ts_subtree_summarize(MutableSubtree, const Subtree *, uint32_t, const TSLanguage *); -void ts_subtree_summarize_children(MutableSubtree, const TSLanguage *); -void ts_subtree_balance(Subtree, SubtreePool *, const TSLanguage *); -Subtree ts_subtree_edit(Subtree, const TSInputEdit *edit, SubtreePool *); -char *ts_subtree_string(Subtree, TSSymbol, bool, const TSLanguage *, bool include_all); -void ts_subtree_print_dot_graph(Subtree, 
const TSLanguage *, FILE *); -Subtree ts_subtree_last_external_token(Subtree); +MutableSubtree ts_subtree_new_node( + TSSymbol symbol, + SubtreeArray *chiildren, + unsigned production_id, + const TSLanguage *language +); +Subtree ts_subtree_new_error_node( + SubtreeArray *children, + bool extra, + const TSLanguage * language +); +Subtree ts_subtree_new_missing_leaf( + SubtreePool *pool, + TSSymbol symbol, + Length padding, + uint32_t lookahead_bytes, + const TSLanguage *language +); +MutableSubtree ts_subtree_make_mut(SubtreePool *pool, Subtree self); +void ts_subtree_retain(Subtree self); +void ts_subtree_release(SubtreePool *pool, Subtree self); +int ts_subtree_compare(Subtree left, Subtree right, SubtreePool *pool); +void ts_subtree_set_symbol(MutableSubtree *self, TSSymbol symbol, const TSLanguage *language); +void ts_subtree_compress(MutableSubtree self, unsigned count, const TSLanguage *language, MutableSubtreeArray *stack); +void ts_subtree_summarize_children(MutableSubtree self, const TSLanguage *language); +Subtree ts_subtree_edit(Subtree self, const TSInputEdit *edit, SubtreePool *pool); +char *ts_subtree_string(Subtree self, TSSymbol alias_symbol, bool alias_is_named, const TSLanguage *language, bool include_all); +void ts_subtree_print_dot_graph(Subtree self, const TSLanguage *language, FILE *f); +Subtree ts_subtree_last_external_token(Subtree tree); const ExternalScannerState *ts_subtree_external_scanner_state(Subtree self); -bool ts_subtree_external_scanner_state_eq(Subtree, Subtree); +bool ts_subtree_external_scanner_state_eq(Subtree self, Subtree other); #define SUBTREE_GET(self, name) ((self).data.is_inline ? 
(self).data.name : (self).ptr->name) diff --git a/tree.c b/tree.c index 1cea1794..e0c60bbf 100644 --- a/tree.c +++ b/tree.c @@ -1,5 +1,3 @@ -#define _POSIX_C_SOURCE 200112L - #include "api.h" #include "./array.h" #include "./get_changed_ranges.h" @@ -148,7 +146,7 @@ void ts_tree_print_dot_graph(const TSTree *self, int fd) { fclose(file); } -#else +#elif !defined(__wasi__) // WASI doesn't support dup #include @@ -162,4 +160,11 @@ void ts_tree_print_dot_graph(const TSTree *self, int file_descriptor) { fclose(file); } +#else + +void ts_tree_print_dot_graph(const TSTree *self, int file_descriptor) { + (void)self; + (void)file_descriptor; +} + #endif diff --git a/tree.h b/tree.h index f012f888..9328f55a 100644 --- a/tree.h +++ b/tree.h @@ -21,8 +21,8 @@ struct TSTree { unsigned included_range_count; }; -TSTree *ts_tree_new(Subtree root, const TSLanguage *language, const TSRange *, unsigned); -TSNode ts_node_new(const TSTree *, const Subtree *, Length, TSSymbol); +TSTree *ts_tree_new(Subtree root, const TSLanguage *language, const TSRange *included_ranges, unsigned included_range_count); +TSNode ts_node_new(const TSTree *tree, const Subtree *subtree, Length position, TSSymbol alias); #ifdef __cplusplus } diff --git a/tree_cursor.c b/tree_cursor.c index c1a3d8a4..c7e65d89 100644 --- a/tree_cursor.c +++ b/tree_cursor.c @@ -1,5 +1,4 @@ #include "api.h" -#include "./alloc.h" #include "./tree_cursor.h" #include "./language.h" #include "./tree.h" @@ -17,11 +16,11 @@ typedef struct { // CursorChildIterator static inline bool ts_tree_cursor_is_entry_visible(const TreeCursor *self, uint32_t index) { - TreeCursorEntry *entry = &self->stack.contents[index]; + TreeCursorEntry *entry = array_get(&self->stack, index); if (index == 0 || ts_subtree_visible(*entry->subtree)) { return true; } else if (!ts_subtree_extra(*entry->subtree)) { - TreeCursorEntry *parent_entry = &self->stack.contents[index - 1]; + TreeCursorEntry *parent_entry = array_get(&self->stack, index - 1); return 
ts_language_alias_at( self->tree->language, parent_entry->subtree->ptr->production_id, @@ -130,14 +129,17 @@ static inline bool ts_tree_cursor_child_iterator_previous( }; *visible = ts_subtree_visible(*child); bool extra = ts_subtree_extra(*child); - if (!extra && self->alias_sequence) { - *visible |= self->alias_sequence[self->structural_child_index]; - self->structural_child_index--; - } self->position = length_backtrack(self->position, ts_subtree_padding(*child)); self->child_index--; + if (!extra && self->alias_sequence) { + *visible |= self->alias_sequence[self->structural_child_index]; + if (self->structural_child_index > 0) { + self->structural_child_index--; + } + } + // unsigned can underflow so compare it to child_count if (self->child_index < self->parent.ptr->child_count) { Subtree previous_child = ts_subtree_children(self->parent)[self->child_index]; @@ -212,7 +214,6 @@ bool ts_tree_cursor_goto_first_child(TSTreeCursor *self) { return false; } } - return false; } TreeCursorStep ts_tree_cursor_goto_last_child_internal(TSTreeCursor *_self) { @@ -253,7 +254,6 @@ bool ts_tree_cursor_goto_last_child(TSTreeCursor *self) { return false; } } - return false; } static inline int64_t ts_tree_cursor_goto_first_child_for_byte_and_point( @@ -274,7 +274,7 @@ static inline int64_t ts_tree_cursor_goto_first_child_for_byte_and_point( CursorChildIterator iterator = ts_tree_cursor_iterate_children(self); while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { Length entry_end = length_add(entry.position, ts_subtree_size(*entry.subtree)); - bool at_goal = entry_end.bytes >= goal_byte && point_gte(entry_end.extent, goal_point); + bool at_goal = entry_end.bytes > goal_byte && point_gt(entry_end.extent, goal_point); uint32_t visible_child_count = ts_subtree_visible_child_count(*entry.subtree); if (at_goal) { if (visible) { @@ -307,8 +307,9 @@ int64_t ts_tree_cursor_goto_first_child_for_point(TSTreeCursor *self, TSPoint go } TreeCursorStep 
ts_tree_cursor_goto_sibling_internal( - TSTreeCursor *_self, - bool (*advance)(CursorChildIterator *, TreeCursorEntry *, bool *)) { + TSTreeCursor *_self, + bool (*advance)(CursorChildIterator *, TreeCursorEntry *, bool *) +) { TreeCursor *self = (TreeCursor *)_self; uint32_t initial_size = self->stack.size; @@ -373,7 +374,7 @@ TreeCursorStep ts_tree_cursor_goto_previous_sibling_internal(TSTreeCursor *_self return step; // restore position from the parent node - const TreeCursorEntry *parent = &self->stack.contents[self->stack.size - 2]; + const TreeCursorEntry *parent = array_get(&self->stack, self->stack.size - 2); Length position = parent->position; uint32_t child_index = array_back(&self->stack)->child_index; const Subtree *children = ts_subtree_children((*(parent->subtree))); @@ -424,7 +425,7 @@ void ts_tree_cursor_goto_descendant( // Ascend to the lowest ancestor that contains the goal node. for (;;) { uint32_t i = self->stack.size - 1; - TreeCursorEntry *entry = &self->stack.contents[i]; + TreeCursorEntry *entry = array_get(&self->stack, i); uint32_t next_descendant_index = entry->descendant_index + (ts_tree_cursor_is_entry_visible(self, i) ? 1 : 0) + @@ -475,9 +476,10 @@ uint32_t ts_tree_cursor_current_descendant_index(const TSTreeCursor *_self) { TSNode ts_tree_cursor_current_node(const TSTreeCursor *_self) { const TreeCursor *self = (const TreeCursor *)_self; TreeCursorEntry *last_entry = array_back(&self->stack); - TSSymbol alias_symbol = self->root_alias_symbol; - if (self->stack.size > 1 && !ts_subtree_extra(*last_entry->subtree)) { - TreeCursorEntry *parent_entry = &self->stack.contents[self->stack.size - 2]; + bool is_extra = ts_subtree_extra(*last_entry->subtree); + TSSymbol alias_symbol = is_extra ? 
0 : self->root_alias_symbol; + if (self->stack.size > 1 && !is_extra) { + TreeCursorEntry *parent_entry = array_get(&self->stack, self->stack.size - 2); alias_symbol = ts_language_alias_at( self->tree->language, parent_entry->subtree->ptr->production_id, @@ -514,8 +516,8 @@ void ts_tree_cursor_current_status( // Walk up the tree, visiting the current node and its invisible ancestors, // because fields can refer to nodes through invisible *wrapper* nodes, for (unsigned i = self->stack.size - 1; i > 0; i--) { - TreeCursorEntry *entry = &self->stack.contents[i]; - TreeCursorEntry *parent_entry = &self->stack.contents[i - 1]; + TreeCursorEntry *entry = array_get(&self->stack, i); + TreeCursorEntry *parent_entry = array_get(&self->stack, i - 1); const TSSymbol *alias_sequence = ts_language_alias_sequence( self->tree->language, @@ -628,11 +630,11 @@ uint32_t ts_tree_cursor_current_depth(const TSTreeCursor *_self) { TSNode ts_tree_cursor_parent_node(const TSTreeCursor *_self) { const TreeCursor *self = (const TreeCursor *)_self; for (int i = (int)self->stack.size - 2; i >= 0; i--) { - TreeCursorEntry *entry = &self->stack.contents[i]; + TreeCursorEntry *entry = array_get(&self->stack, i); bool is_visible = true; TSSymbol alias_symbol = 0; if (i > 0) { - TreeCursorEntry *parent_entry = &self->stack.contents[i - 1]; + TreeCursorEntry *parent_entry = array_get(&self->stack, i - 1); alias_symbol = ts_language_alias_at( self->tree->language, parent_entry->subtree->ptr->production_id, @@ -657,8 +659,8 @@ TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) { // Walk up the tree, visiting the current node and its invisible ancestors. 
for (unsigned i = self->stack.size - 1; i > 0; i--) { - TreeCursorEntry *entry = &self->stack.contents[i]; - TreeCursorEntry *parent_entry = &self->stack.contents[i - 1]; + TreeCursorEntry *entry = array_get(&self->stack, i); + TreeCursorEntry *parent_entry = array_get(&self->stack, i - 1); // Stop walking up when another visible node is found. if ( diff --git a/tree_cursor.h b/tree_cursor.h index 96a386df..7d4e7ef0 100644 --- a/tree_cursor.h +++ b/tree_cursor.h @@ -23,19 +23,19 @@ typedef enum { TreeCursorStepVisible, } TreeCursorStep; -void ts_tree_cursor_init(TreeCursor *, TSNode); +void ts_tree_cursor_init(TreeCursor *self, TSNode node); void ts_tree_cursor_current_status( - const TSTreeCursor *, - TSFieldId *, - bool *, - bool *, - bool *, - TSSymbol *, - unsigned * + const TSTreeCursor *_self, + TSFieldId *field_id, + bool *has_later_siblings, + bool *has_later_named_siblings, + bool *can_have_later_siblings_with_this_field, + TSSymbol *supertypes, + unsigned *supertype_count ); -TreeCursorStep ts_tree_cursor_goto_first_child_internal(TSTreeCursor *); -TreeCursorStep ts_tree_cursor_goto_next_sibling_internal(TSTreeCursor *); +TreeCursorStep ts_tree_cursor_goto_first_child_internal(TSTreeCursor *_self); +TreeCursorStep ts_tree_cursor_goto_next_sibling_internal(TSTreeCursor *_self); static inline Subtree ts_tree_cursor_current_subtree(const TSTreeCursor *_self) { const TreeCursor *self = (const TreeCursor *)_self; @@ -43,6 +43,6 @@ static inline Subtree ts_tree_cursor_current_subtree(const TSTreeCursor *_self) return *last_entry->subtree; } -TSNode ts_tree_cursor_parent_node(const TSTreeCursor *); +TSNode ts_tree_cursor_parent_node(const TSTreeCursor *_self); #endif // TREE_SITTER_TREE_CURSOR_H_ diff --git a/ts_assert.h b/ts_assert.h new file mode 100644 index 00000000..4cb8f36a --- /dev/null +++ b/ts_assert.h @@ -0,0 +1,11 @@ +#ifndef TREE_SITTER_ASSERT_H_ +#define TREE_SITTER_ASSERT_H_ + +#ifdef NDEBUG +#define ts_assert(e) ((void)(e)) +#else +#include 
+#define ts_assert(e) assert(e) +#endif + +#endif // TREE_SITTER_ASSERT_H_ diff --git a/unicode.h b/unicode.h index addba621..50773e9f 100644 --- a/unicode.h +++ b/unicode.h @@ -12,34 +12,59 @@ extern "C" { #define U_EXPORT2 #include "utf8.h" #include "utf16.h" +#include "endian.h" + +#define U16_NEXT_LE(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \ + (c)=le16toh((s)[(i)++]); \ + if(U16_IS_LEAD(c)) { \ + uint16_t __c2; \ + if((i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \ + ++(i); \ + (c)=U16_GET_SUPPLEMENTARY((c), __c2); \ + } \ + } \ +} UPRV_BLOCK_MACRO_END + +#define U16_NEXT_BE(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \ + (c)=be16toh((s)[(i)++]); \ + if(U16_IS_LEAD(c)) { \ + uint16_t __c2; \ + if((i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \ + ++(i); \ + (c)=U16_GET_SUPPLEMENTARY((c), __c2); \ + } \ + } \ +} UPRV_BLOCK_MACRO_END static const int32_t TS_DECODE_ERROR = U_SENTINEL; -// These functions read one unicode code point from the given string, -// returning the number of bytes consumed. 
-typedef uint32_t (*UnicodeDecodeFunction)( +static inline uint32_t ts_decode_utf8( const uint8_t *string, uint32_t length, int32_t *code_point -); +) { + uint32_t i = 0; + U8_NEXT(string, i, length, *code_point); + return i; +} -static inline uint32_t ts_decode_utf8( +static inline uint32_t ts_decode_utf16_le( const uint8_t *string, uint32_t length, int32_t *code_point ) { uint32_t i = 0; - U8_NEXT(string, i, length, *code_point); - return i; + U16_NEXT_LE(((uint16_t *)string), i, length, *code_point); + return i * 2; } -static inline uint32_t ts_decode_utf16( +static inline uint32_t ts_decode_utf16_be( const uint8_t *string, uint32_t length, int32_t *code_point ) { uint32_t i = 0; - U16_NEXT(((uint16_t *)string), i, length, *code_point); + U16_NEXT_BE(((uint16_t *)string), i, length, *code_point); return i * 2; } diff --git a/wasm_store.c b/wasm_store.c index 7860c922..1b330896 100644 --- a/wasm_store.c +++ b/wasm_store.c @@ -4,16 +4,25 @@ #ifdef TREE_SITTER_FEATURE_WASM -#include -#include -#include #include "./alloc.h" #include "./array.h" #include "./atomic.h" #include "./language.h" #include "./lexer.h" -#include "./wasm_store.h" #include "./wasm/wasm-stdlib.h" +#include "./wasm_store.h" + +#include +#include +#include + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4100) +#elif defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#endif #define array_len(a) (sizeof(a) / sizeof(a[0])) @@ -101,7 +110,6 @@ struct TSWasmStore { wasm_globaltype_t *const_i32_type; bool has_error; uint32_t lexer_address; - uint32_t serialization_buffer_address; }; typedef Array(char) StringData; @@ -109,7 +117,7 @@ typedef Array(char) StringData; // LanguageInWasmMemory - The memory layout of a `TSLanguage` when compiled to // wasm32. This is used to copy static language data out of the wasm memory. 
typedef struct { - uint32_t version; + uint32_t abi_version; uint32_t symbol_count; uint32_t alias_count; uint32_t token_count; @@ -145,6 +153,14 @@ typedef struct { int32_t deserialize; } external_scanner; int32_t primary_state_ids; + int32_t name; + int32_t reserved_words; + uint16_t max_reserved_word_set_size; + uint32_t supertype_count; + int32_t supertype_symbols; + int32_t supertype_map_slices; + int32_t supertype_map_entries; + TSLanguageMetadata metadata; } LanguageInWasmMemory; // LexerInWasmMemory - The memory layout of a `TSLexer` when compiled to wasm32. @@ -159,17 +175,15 @@ typedef struct { int32_t eof; } LexerInWasmMemory; -static volatile uint32_t NEXT_LANGUAGE_ID; - // Linear memory layout: -// [ <-- stack | stdlib statics | lexer | serialization_buffer | language statics --> | heap --> ] +// [ <-- stack | stdlib statics | lexer | language statics --> | serialization_buffer | heap --> ] #define MAX_MEMORY_SIZE (128 * 1024 * 1024 / MEMORY_PAGE_SIZE) /************************ * WasmDylinkMemoryInfo ***********************/ -static uint8_t read_u8(const uint8_t **p, const uint8_t *end) { +static uint8_t read_u8(const uint8_t **p) { return *(*p)++; } @@ -204,7 +218,7 @@ static bool wasm_dylink_info__parse( p += 4; while (p < end) { - uint8_t section_id = read_u8(&p, end); + uint8_t section_id = read_u8(&p); uint32_t section_length = read_uleb128(&p, end); const uint8_t *section_end = p + section_length; if (section_end > end) return false; @@ -217,7 +231,7 @@ static bool wasm_dylink_info__parse( if (name_length == 8 && memcmp(p, "dylink.0", 8) == 0) { p = name_end; while (p < section_end) { - uint8_t subsection_type = read_u8(&p, section_end); + uint8_t subsection_type = read_u8(&p); uint32_t subsection_size = read_uleb128(&p, section_end); const uint8_t *subsection_end = p + subsection_size; if (subsection_end > section_end) return false; @@ -258,7 +272,7 @@ static wasm_trap_t *callback__debug_message( ) { wasmtime_context_t *context = 
wasmtime_caller_context(caller); TSWasmStore *store = env; - assert(args_and_results_len == 2); + ts_assert(args_and_results_len == 2); uint32_t string_address = args_and_results[0].i32; uint32_t value = args_and_results[1].i32; uint8_t *memory = wasmtime_memory_data(context, &store->memory); @@ -282,7 +296,7 @@ static wasm_trap_t *callback__lexer_advance( size_t args_and_results_len ) { wasmtime_context_t *context = wasmtime_caller_context(caller); - assert(args_and_results_len == 2); + ts_assert(args_and_results_len == 2); TSWasmStore *store = env; TSLexer *lexer = store->current_lexer; @@ -408,6 +422,17 @@ static void *copy_strings( return result; } +static void *copy_string( + const uint8_t *data, + int32_t address +) { + const char *string = (const char *)&data[address]; + size_t len = strlen(string); + char *result = ts_malloc(len + 1); + memcpy(result, string, len + 1); + return result; +} + static bool name_eq(const wasm_name_t *name, const char *string) { return strncmp(string, name->data, name->size) == 0; } @@ -432,7 +457,7 @@ static inline wasm_functype_t* wasm_functype_new_4_0( snprintf(*output, message_length + 1, __VA_ARGS__); \ } while (0) -WasmLanguageId *language_id_new() { +WasmLanguageId *language_id_new(void) { WasmLanguageId *self = ts_malloc(sizeof(WasmLanguageId)); self->is_language_deleted = false; self->ref_count = 1; @@ -458,7 +483,7 @@ static wasmtime_extern_t get_builtin_extern( .kind = WASMTIME_EXTERN_FUNC, .of.func = (wasmtime_func_t) { .store_id = table->store_id, - .index = index + .__private = index } }; } @@ -476,13 +501,13 @@ static bool ts_wasm_store__provide_builtin_import( wasmtime_val_t value = WASM_I32_VAL(self->current_memory_offset); wasmtime_global_t global; error = wasmtime_global_new(context, self->const_i32_type, &value, &global); - assert(!error); + ts_assert(!error); *import = (wasmtime_extern_t) {.kind = WASMTIME_EXTERN_GLOBAL, .of.global = global}; } else if (name_eq(import_name, "__table_base")) { wasmtime_val_t 
value = WASM_I32_VAL(self->current_function_table_offset); wasmtime_global_t global; error = wasmtime_global_new(context, self->const_i32_type, &value, &global); - assert(!error); + ts_assert(!error); *import = (wasmtime_extern_t) {.kind = WASMTIME_EXTERN_GLOBAL, .of.global = global}; } else if (name_eq(import_name, "__stack_pointer")) { *import = (wasmtime_extern_t) {.kind = WASMTIME_EXTERN_GLOBAL, .of.global = self->stack_pointer_global}; @@ -530,7 +555,7 @@ static bool ts_wasm_store__call_module_initializer( wasmtime_context_t *context = wasmtime_store_context(self->store); wasmtime_func_t initialization_func = export->of.func; wasmtime_error_t *error = wasmtime_func_call(context, &initialization_func, NULL, 0, NULL, 0, trap); - assert(!error); + ts_assert(!error); return true; } else { return false; @@ -545,6 +570,7 @@ TSWasmStore *ts_wasm_store_new(TSWasmEngine *engine, TSWasmError *wasm_error) { wasm_trap_t *trap = NULL; wasm_message_t message = WASM_EMPTY_VEC; wasm_exporttype_vec_t export_types = WASM_EMPTY_VEC; + wasm_importtype_vec_t import_types = WASM_EMPTY_VEC; wasmtime_extern_t *imports = NULL; wasmtime_module_t *stdlib_module = NULL; wasm_memorytype_t *memory_type = NULL; @@ -635,14 +661,14 @@ TSWasmStore *ts_wasm_store_new(TSWasmEngine *engine, TSWasmError *wasm_error) { FunctionDefinition *definition = &builtin_definitions[i]; wasmtime_func_t func; wasmtime_func_new_unchecked(context, definition->type, definition->callback, self, NULL, &func); - *definition->storage_location = func.index; + *definition->storage_location = func.__private; wasm_functype_delete(definition->type); } for (unsigned i = 0; i < lexer_definitions_len; i++) { FunctionDefinition *definition = &lexer_definitions[i]; wasmtime_func_t func; wasmtime_func_new_unchecked(context, definition->type, definition->callback, self, NULL, &func); - *definition->storage_location = func.index; + *definition->storage_location = func.__private; wasm_functype_delete(definition->type); } @@ 
-660,11 +686,10 @@ TSWasmStore *ts_wasm_store_new(TSWasmEngine *engine, TSWasmError *wasm_error) { } // Retrieve the stdlib module's imports. - wasm_importtype_vec_t import_types = WASM_EMPTY_VEC; wasmtime_module_imports(stdlib_module, &import_types); // Find the initial number of memory pages needed by the stdlib. - const wasm_memorytype_t *stdlib_memory_type; + const wasm_memorytype_t *stdlib_memory_type = NULL; for (unsigned i = 0; i < import_types.size; i++) { wasm_importtype_t *import_type = import_types.data[i]; const wasm_name_t *import_name = wasm_importtype_name(import_type); @@ -729,10 +754,11 @@ TSWasmStore *ts_wasm_store_new(TSWasmEngine *engine, TSWasmError *wasm_error) { wasmtime_val_t stack_pointer_value = WASM_I32_VAL(0); wasmtime_global_t stack_pointer_global; error = wasmtime_global_new(context, var_i32_type, &stack_pointer_value, &stack_pointer_global); - assert(!error); + wasm_globaltype_delete(var_i32_type); + ts_assert(!error); *self = (TSWasmStore) { - .engine = engine, + .engine = wasmtime_engine_clone(engine), .store = store, .memory = memory, .function_table = function_table, @@ -801,7 +827,7 @@ TSWasmStore *ts_wasm_store_new(TSWasmEngine *engine, TSWasmError *wasm_error) { size_t name_len; wasmtime_extern_t export = {.kind = WASM_EXTERN_GLOBAL}; bool exists = wasmtime_instance_export_nth(context, &instance, i, &export_name, &name_len, &export); - assert(exists); + ts_assert(exists); if (export.kind == WASMTIME_EXTERN_GLOBAL) { if (name_eq(name, "__stack_pointer")) { @@ -825,13 +851,13 @@ TSWasmStore *ts_wasm_store_new(TSWasmEngine *engine, TSWasmError *wasm_error) { } if (name_eq(name, "reset_heap")) { - self->builtin_fn_indices.reset_heap = export.of.func.index; + self->builtin_fn_indices.reset_heap = export.of.func.__private; continue; } for (unsigned j = 0; j < stdlib_symbols_len; j++) { if (name_eq(name, STDLIB_SYMBOLS[j])) { - self->stdlib_fn_indices[j] = export.of.func.index; + self->stdlib_fn_indices[j] = export.of.func.__private; 
break; } } @@ -864,7 +890,7 @@ TSWasmStore *ts_wasm_store_new(TSWasmEngine *engine, TSWasmError *wasm_error) { // Add all of the lexer callback functions to the function table. Store their function table // indices on the in-memory lexer. - uint32_t table_index; + uint64_t table_index; error = wasmtime_table_grow(context, &function_table, lexer_definitions_len, &initializer, &table_index); if (error) { wasmtime_error_message(error, &message); @@ -881,15 +907,14 @@ TSWasmStore *ts_wasm_store_new(TSWasmEngine *engine, TSWasmError *wasm_error) { wasmtime_func_t func = {function_table.store_id, *definition->storage_location}; wasmtime_val_t func_val = {.kind = WASMTIME_FUNCREF, .of.funcref = func}; error = wasmtime_table_set(context, &function_table, table_index, &func_val); - assert(!error); + ts_assert(!error); *(int32_t *)(definition->storage_location) = table_index; table_index++; } self->current_function_table_offset = table_index; self->lexer_address = initial_memory_pages * MEMORY_PAGE_SIZE; - self->serialization_buffer_address = self->lexer_address + sizeof(LexerInWasmMemory); - self->current_memory_offset = self->serialization_buffer_address + TREE_SITTER_SERIALIZATION_BUFFER_SIZE; + self->current_memory_offset = self->lexer_address + sizeof(LexerInWasmMemory); // Grow the memory enough to hold the builtin lexer and serialization buffer. 
uint32_t new_pages_needed = (self->current_memory_offset - self->lexer_address - 1) / MEMORY_PAGE_SIZE + 1; @@ -922,7 +947,7 @@ void ts_wasm_store_delete(TSWasmStore *self) { wasmtime_store_delete(self->store); wasm_engine_delete(self->engine); for (unsigned i = 0; i < self->language_instances.size; i++) { - LanguageWasmInstance *instance = &self->language_instances.contents[i]; + LanguageWasmInstance *instance = array_get(&self->language_instances, i); language_id_delete(instance->language_id); } array_delete(&self->language_instances); @@ -932,7 +957,7 @@ void ts_wasm_store_delete(TSWasmStore *self) { size_t ts_wasm_store_language_count(const TSWasmStore *self) { size_t result = 0; for (unsigned i = 0; i < self->language_instances.size; i++) { - const WasmLanguageId *id = self->language_instances.contents[i].language_id; + const WasmLanguageId *id = array_get(&self->language_instances, i)->language_id; if (!id->is_language_deleted) { result++; } @@ -940,6 +965,14 @@ size_t ts_wasm_store_language_count(const TSWasmStore *self) { return result; } +static uint32_t ts_wasm_store__heap_address(TSWasmStore *self) { + return self->current_memory_offset + TREE_SITTER_SERIALIZATION_BUFFER_SIZE; +} + +static uint32_t ts_wasm_store__serialization_buffer_address(TSWasmStore *self) { + return self->current_memory_offset; +} + static bool ts_wasm_store__instantiate( TSWasmStore *self, wasmtime_module_t *module, @@ -958,7 +991,7 @@ static bool ts_wasm_store__instantiate( // Grow the function table to make room for the new functions. wasmtime_val_t initializer = {.kind = WASMTIME_FUNCREF}; - uint32_t prev_table_size; + uint64_t prev_table_size; error = wasmtime_table_grow(context, &self->function_table, dylink_info->table_size, &initializer, &prev_table_size); if (error) { format(error_message, "invalid function table size %u", dylink_info->table_size); @@ -966,7 +999,7 @@ static bool ts_wasm_store__instantiate( } // Grow the memory to make room for the new data. 
- uint32_t needed_memory_size = self->current_memory_offset + dylink_info->memory_size; + uint32_t needed_memory_size = ts_wasm_store__heap_address(self) + dylink_info->memory_size; uint32_t current_memory_size = wasmtime_memory_data_size(context, &self->memory); if (needed_memory_size > current_memory_size) { uint32_t pages_to_grow = ( @@ -1062,7 +1095,7 @@ static bool ts_wasm_store__instantiate( char *export_name; wasmtime_extern_t export = {.kind = WASM_EXTERN_GLOBAL}; bool exists = wasmtime_instance_export_nth(context, &instance, i, &export_name, &name_len, &export); - assert(exists); + ts_assert(exists); // If the module exports an initialization or data-relocation function, call it. if (ts_wasm_store__call_module_initializer(self, name, &export, &trap)) { @@ -1098,7 +1131,7 @@ static bool ts_wasm_store__instantiate( wasmtime_func_t language_func = language_extern.of.func; wasmtime_val_t language_address_val; error = wasmtime_func_call(context, &language_func, NULL, 0, &language_address_val, 1, &trap); - assert(!error); + ts_assert(!error); if (trap) { wasm_trap_message(trap, &message); format( @@ -1188,31 +1221,32 @@ const TSLanguage *ts_wasm_store_load_language( const uint8_t *memory = wasmtime_memory_data(context, &self->memory); memcpy(&wasm_language, &memory[language_address], sizeof(LanguageInWasmMemory)); - if (wasm_language.version < LANGUAGE_VERSION_USABLE_VIA_WASM) { - wasm_error->kind = TSWasmErrorKindInstantiate; - format(&wasm_error->message, "language version %u is too old for wasm", wasm_language.version); - goto error; - } + bool has_supertypes = + wasm_language.abi_version > LANGUAGE_VERSION_WITH_RESERVED_WORDS && + wasm_language.supertype_count > 0; int32_t addresses[] = { - wasm_language.alias_map, - wasm_language.alias_sequences, - wasm_language.field_map_entries, - wasm_language.field_map_slices, - wasm_language.field_names, - wasm_language.keyword_lex_fn, - wasm_language.lex_fn, - wasm_language.lex_modes, - wasm_language.parse_actions, 
wasm_language.parse_table, - wasm_language.primary_state_ids, - wasm_language.primary_state_ids, - wasm_language.public_symbol_map, wasm_language.small_parse_table, wasm_language.small_parse_table_map, - wasm_language.symbol_metadata, - wasm_language.symbol_metadata, + wasm_language.parse_actions, wasm_language.symbol_names, + wasm_language.field_names, + wasm_language.field_map_slices, + wasm_language.field_map_entries, + wasm_language.symbol_metadata, + wasm_language.public_symbol_map, + wasm_language.alias_map, + wasm_language.alias_sequences, + wasm_language.lex_modes, + wasm_language.lex_fn, + wasm_language.keyword_lex_fn, + wasm_language.primary_state_ids, + wasm_language.name, + wasm_language.reserved_words, + has_supertypes ? wasm_language.supertype_symbols : 0, + has_supertypes ? wasm_language.supertype_map_entries : 0, + has_supertypes ? wasm_language.supertype_map_slices : 0, wasm_language.external_token_count > 0 ? wasm_language.external_scanner.states : 0, wasm_language.external_token_count > 0 ? wasm_language.external_scanner.symbol_map : 0, wasm_language.external_token_count > 0 ? 
wasm_language.external_scanner.create : 0, @@ -1230,7 +1264,7 @@ const TSLanguage *ts_wasm_store_load_language( StringData field_name_buffer = array_new(); *language = (TSLanguage) { - .version = wasm_language.version, + .abi_version = wasm_language.abi_version, .symbol_count = wasm_language.symbol_count, .alias_count = wasm_language.alias_count, .token_count = wasm_language.token_count, @@ -1239,8 +1273,10 @@ const TSLanguage *ts_wasm_store_load_language( .large_state_count = wasm_language.large_state_count, .production_id_count = wasm_language.production_id_count, .field_count = wasm_language.field_count, + .supertype_count = wasm_language.supertype_count, .max_alias_sequence_length = wasm_language.max_alias_sequence_length, .keyword_capture_token = wasm_language.keyword_capture_token, + .metadata = wasm_language.metadata, .parse_table = copy( &memory[wasm_language.parse_table], wasm_language.large_state_count * wasm_language.symbol_count * sizeof(uint16_t) @@ -1267,19 +1303,30 @@ const TSLanguage *ts_wasm_store_load_language( ), .lex_modes = copy( &memory[wasm_language.lex_modes], - wasm_language.state_count * sizeof(TSLexMode) + wasm_language.state_count * sizeof(TSLexerMode) ), }; if (language->field_count > 0 && language->production_id_count > 0) { language->field_map_slices = copy( &memory[wasm_language.field_map_slices], - wasm_language.production_id_count * sizeof(TSFieldMapSlice) + wasm_language.production_id_count * sizeof(TSMapSlice) ); - const TSFieldMapSlice last_field_map_slice = language->field_map_slices[language->production_id_count - 1]; + + // Determine the number of field map entries by finding the greatest index + // in any of the slices. 
+ uint32_t field_map_entry_count = 0; + for (uint32_t i = 0; i < wasm_language.production_id_count; i++) { + TSMapSlice slice = language->field_map_slices[i]; + uint32_t slice_end = slice.index + slice.length; + if (slice_end > field_map_entry_count) { + field_map_entry_count = slice_end; + } + } + language->field_map_entries = copy( &memory[wasm_language.field_map_entries], - (last_field_map_slice.index + last_field_map_slice.length) * sizeof(TSFieldMapEntry) + field_map_entry_count * sizeof(TSFieldMapEntry) ); language->field_names = copy_strings( memory, @@ -1289,6 +1336,37 @@ const TSLanguage *ts_wasm_store_load_language( ); } + if (has_supertypes) { + language->supertype_symbols = copy( + &memory[wasm_language.supertype_symbols], + wasm_language.supertype_count * sizeof(TSSymbol) + ); + + // Determine the number of supertype map slices by finding the greatest + // supertype ID. + int largest_supertype = 0; + for (unsigned i = 0; i < language->supertype_count; i++) { + TSSymbol supertype = language->supertype_symbols[i]; + if (supertype > largest_supertype) { + largest_supertype = supertype; + } + } + + language->supertype_map_slices = copy( + &memory[wasm_language.supertype_map_slices], + (largest_supertype + 1) * sizeof(TSMapSlice) + ); + + TSSymbol last_supertype = language->supertype_symbols[language->supertype_count - 1]; + TSMapSlice last_slice = language->supertype_map_slices[last_supertype]; + uint32_t supertype_map_entry_count = last_slice.index + last_slice.length; + + language->supertype_map_entries = copy( + &memory[wasm_language.supertype_map_entries], + supertype_map_entry_count * sizeof(char *) + ); + } + if (language->max_alias_sequence_length > 0 && language->production_id_count > 0) { // The alias map contains symbols, alias counts, and aliases, terminated by a null symbol. 
int32_t alias_map_size = 0; @@ -1325,13 +1403,22 @@ const TSLanguage *ts_wasm_store_load_language( ); } - if (language->version >= LANGUAGE_VERSION_WITH_PRIMARY_STATES) { + if (language->abi_version >= LANGUAGE_VERSION_WITH_PRIMARY_STATES) { language->primary_state_ids = copy( &memory[wasm_language.primary_state_ids], wasm_language.state_count * sizeof(TSStateId) ); } + if (language->abi_version >= LANGUAGE_VERSION_WITH_RESERVED_WORDS) { + language->name = copy_string(memory, wasm_language.name); + language->reserved_words = copy( + &memory[wasm_language.reserved_words], + wasm_language.max_reserved_word_set_size * sizeof(TSSymbol) + ); + language->max_reserved_word_set_size = wasm_language.max_reserved_word_set_size; + } + if (language->external_token_count > 0) { language->external_scanner.symbol_map = copy( &memory[wasm_language.external_scanner.symbol_map], @@ -1360,11 +1447,11 @@ const TSLanguage *ts_wasm_store_load_language( // to mark this language as WASM-based and to store the language's // WASM-specific data. language->lex_fn = ts_wasm_store__sentinel_lex_fn; - language->keyword_lex_fn = (void *)language_module; + language->keyword_lex_fn = (bool (*)(TSLexer *, TSStateId))language_module; // Clear out any instances of languages that have been deleted. for (unsigned i = 0; i < self->language_instances.size; i++) { - WasmLanguageId *id = self->language_instances.contents[i].language_id; + WasmLanguageId *id = array_get(&self->language_instances, i)->language_id; if (id->is_language_deleted) { language_id_delete(id); array_erase(&self->language_instances, i); @@ -1405,7 +1492,7 @@ bool ts_wasm_store_add_language( // instances of languages that have been deleted. 
bool exists = false; for (unsigned i = 0; i < self->language_instances.size; i++) { - WasmLanguageId *id = self->language_instances.contents[i].language_id; + WasmLanguageId *id = array_get(&self->language_instances, i)->language_id; if (id->is_language_deleted) { language_id_delete(id); array_erase(&self->language_instances, i); @@ -1464,19 +1551,19 @@ void ts_wasm_store_reset_heap(TSWasmStore *self) { }; wasm_trap_t *trap = NULL; wasmtime_val_t args[1] = { - {.of.i32 = self->current_memory_offset, .kind = WASMTIME_I32}, + {.of.i32 = ts_wasm_store__heap_address(self), .kind = WASMTIME_I32}, }; wasmtime_error_t *error = wasmtime_func_call(context, &func, args, 1, NULL, 0, &trap); - assert(!error); - assert(!trap); + ts_assert(!error); + ts_assert(!trap); } bool ts_wasm_store_start(TSWasmStore *self, TSLexer *lexer, const TSLanguage *language) { uint32_t instance_index; if (!ts_wasm_store_add_language(self, language, &instance_index)) return false; self->current_lexer = lexer; - self->current_instance = &self->language_instances.contents[instance_index]; + self->current_instance = array_get(&self->language_instances, instance_index); self->has_error = false; ts_wasm_store_reset_heap(self); return true; @@ -1498,8 +1585,8 @@ static void ts_wasm_store__call( wasmtime_context_t *context = wasmtime_store_context(self->store); wasmtime_val_t value; bool succeeded = wasmtime_table_get(context, &self->function_table, function_index, &value); - assert(succeeded); - assert(value.kind == WASMTIME_FUNCREF); + ts_assert(succeeded); + ts_assert(value.kind == WASMTIME_FUNCREF); wasmtime_func_t func = value.of.funcref; wasm_trap_t *trap = NULL; @@ -1527,13 +1614,22 @@ static void ts_wasm_store__call( } } +// The data fields of TSLexer, without the function pointers. +// +// This portion of the struct needs to be copied in and out +// of wasm memory before and after calling a scan function. 
+typedef struct { + int32_t lookahead; + TSSymbol result_symbol; +} TSLexerDataPrefix; + static bool ts_wasm_store__call_lex_function(TSWasmStore *self, unsigned function_index, TSStateId state) { wasmtime_context_t *context = wasmtime_store_context(self->store); uint8_t *memory_data = wasmtime_memory_data(context, &self->memory); memcpy( &memory_data[self->lexer_address], - &self->current_lexer->lookahead, - sizeof(self->current_lexer->lookahead) + self->current_lexer, + sizeof(TSLexerDataPrefix) ); wasmtime_val_raw_t args[2] = { @@ -1545,9 +1641,9 @@ static bool ts_wasm_store__call_lex_function(TSWasmStore *self, unsigned functio bool result = args[0].i32; memcpy( - &self->current_lexer->lookahead, + self->current_lexer, &memory_data[self->lexer_address], - sizeof(self->current_lexer->lookahead) + sizeof(self->current_lexer->result_symbol) + sizeof(TSLexerDataPrefix) ); return result; } @@ -1592,8 +1688,8 @@ bool ts_wasm_store_call_scanner_scan( memcpy( &memory_data[self->lexer_address], - &self->current_lexer->lookahead, - sizeof(self->current_lexer->lookahead) + self->current_lexer, + sizeof(TSLexerDataPrefix) ); uint32_t valid_tokens_address = @@ -1608,9 +1704,9 @@ bool ts_wasm_store_call_scanner_scan( if (self->has_error) return false; memcpy( - &self->current_lexer->lookahead, + self->current_lexer, &memory_data[self->lexer_address], - sizeof(self->current_lexer->lookahead) + sizeof(self->current_lexer->result_symbol) + sizeof(TSLexerDataPrefix) ); return args[0].i32; } @@ -1622,20 +1718,25 @@ uint32_t ts_wasm_store_call_scanner_serialize( ) { wasmtime_context_t *context = wasmtime_store_context(self->store); uint8_t *memory_data = wasmtime_memory_data(context, &self->memory); + uint32_t serialization_buffer_address = ts_wasm_store__serialization_buffer_address(self); wasmtime_val_raw_t args[2] = { {.i32 = scanner_address}, - {.i32 = self->serialization_buffer_address}, + {.i32 = serialization_buffer_address}, }; ts_wasm_store__call(self, 
self->current_instance->scanner_serialize_fn_index, args, 2); if (self->has_error) return 0; uint32_t length = args[0].i32; + if (length > TREE_SITTER_SERIALIZATION_BUFFER_SIZE) { + self->has_error = true; + return 0; + } if (length > 0) { memcpy( ((Lexer *)self->current_lexer)->debug_buffer, - &memory_data[self->serialization_buffer_address], + &memory_data[serialization_buffer_address], length ); } @@ -1650,10 +1751,11 @@ void ts_wasm_store_call_scanner_deserialize( ) { wasmtime_context_t *context = wasmtime_store_context(self->store); uint8_t *memory_data = wasmtime_memory_data(context, &self->memory); + uint32_t serialization_buffer_address = ts_wasm_store__serialization_buffer_address(self); if (length > 0) { memcpy( - &memory_data[self->serialization_buffer_address], + &memory_data[serialization_buffer_address], buffer, length ); @@ -1661,7 +1763,7 @@ void ts_wasm_store_call_scanner_deserialize( wasmtime_val_raw_t args[3] = { {.i32 = scanner_address}, - {.i32 = self->serialization_buffer_address}, + {.i32 = serialization_buffer_address}, {.i32 = length}, }; ts_wasm_store__call(self, self->current_instance->scanner_deserialize_fn_index, args, 3); @@ -1681,13 +1783,13 @@ static inline LanguageWasmModule *ts_language__wasm_module(const TSLanguage *sel void ts_wasm_language_retain(const TSLanguage *self) { LanguageWasmModule *module = ts_language__wasm_module(self); - assert(module->ref_count > 0); + ts_assert(module->ref_count > 0); atomic_inc(&module->ref_count); } void ts_wasm_language_release(const TSLanguage *self) { LanguageWasmModule *module = ts_language__wasm_module(self); - assert(module->ref_count > 0); + ts_assert(module->ref_count > 0); if (atomic_dec(&module->ref_count) == 0) { // Update the language id to reflect that the language is deleted. This allows any wasm stores // that hold wasm instances for this language to delete those instances. 
@@ -1705,8 +1807,13 @@ void ts_wasm_language_release(const TSLanguage *self) { ts_free((void *)self->external_scanner.symbol_map); ts_free((void *)self->field_map_entries); ts_free((void *)self->field_map_slices); + ts_free((void *)self->supertype_symbols); + ts_free((void *)self->supertype_map_entries); + ts_free((void *)self->supertype_map_slices); ts_free((void *)self->field_names); ts_free((void *)self->lex_modes); + ts_free((void *)self->name); + ts_free((void *)self->reserved_words); ts_free((void *)self->parse_actions); ts_free((void *)self->parse_table); ts_free((void *)self->primary_state_ids); @@ -1719,6 +1826,12 @@ void ts_wasm_language_release(const TSLanguage *self) { } } +#ifdef _MSC_VER +#pragma warning(pop) +#elif defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif + #else // If the WASM feature is not enabled, define dummy versions of all of the diff --git a/wasm_store.h b/wasm_store.h index 1ad2ae57..16d48ba7 100644 --- a/wasm_store.h +++ b/wasm_store.h @@ -8,21 +8,21 @@ extern "C" { #include "api.h" #include "./parser.h" -bool ts_wasm_store_start(TSWasmStore *, TSLexer *, const TSLanguage *); -void ts_wasm_store_reset(TSWasmStore *); -bool ts_wasm_store_has_error(const TSWasmStore *); +bool ts_wasm_store_start(TSWasmStore *self, TSLexer *lexer, const TSLanguage *language); +void ts_wasm_store_reset(TSWasmStore *self); +bool ts_wasm_store_has_error(const TSWasmStore *self); -bool ts_wasm_store_call_lex_main(TSWasmStore *, TSStateId); -bool ts_wasm_store_call_lex_keyword(TSWasmStore *, TSStateId); +bool ts_wasm_store_call_lex_main(TSWasmStore *self, TSStateId state); +bool ts_wasm_store_call_lex_keyword(TSWasmStore *self, TSStateId state); -uint32_t ts_wasm_store_call_scanner_create(TSWasmStore *); -void ts_wasm_store_call_scanner_destroy(TSWasmStore *, uint32_t); -bool ts_wasm_store_call_scanner_scan(TSWasmStore *, uint32_t, uint32_t); -uint32_t ts_wasm_store_call_scanner_serialize(TSWasmStore *, uint32_t, char *); -void 
ts_wasm_store_call_scanner_deserialize(TSWasmStore *, uint32_t, const char *, unsigned); +uint32_t ts_wasm_store_call_scanner_create(TSWasmStore *self); +void ts_wasm_store_call_scanner_destroy(TSWasmStore *self, uint32_t scanner_address); +bool ts_wasm_store_call_scanner_scan(TSWasmStore *self, uint32_t scanner_address, uint32_t valid_tokens_ix); +uint32_t ts_wasm_store_call_scanner_serialize(TSWasmStore *self, uint32_t scanner_address, char *buffer); +void ts_wasm_store_call_scanner_deserialize(TSWasmStore *self, uint32_t scanner, const char *buffer, unsigned length); -void ts_wasm_language_retain(const TSLanguage *); -void ts_wasm_language_release(const TSLanguage *); +void ts_wasm_language_retain(const TSLanguage *self); +void ts_wasm_language_release(const TSLanguage *self); #ifdef __cplusplus }