|
| 1 | +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
| 2 | +From: Diego Alonso < [email protected]> |
| 3 | +Date: Thu, 26 Jun 2025 14:04:51 +0200 |
| 4 | +Subject: Allow parsing macro identifiers in variable declarations |
| 5 | + |
| 6 | +--- |
| 7 | + grammar.js | 2 + |
| 8 | + src/scanner.c | 128 +++++++++++++++++++++++++++++++++++++++----------- |
| 9 | + 2 files changed, 103 insertions(+), 27 deletions(-) |
| 10 | + |
| 11 | +diff --git a/grammar.js b/grammar.js |
| 12 | +index 6e79004..40ac8b7 100644 |
| 13 | +--- a/grammar.js |
| 14 | ++++ b/grammar.js |
| 15 | +@@ -67,6 +67,7 @@ module.exports = grammar({ |
| 16 | + $._external_end_of_statement, |
| 17 | + $._preproc_unary_operator, |
| 18 | + $.hollerith_constant, |
| 19 | ++ $.macro_identifier, |
| 20 | + ], |
| 21 | + |
| 22 | + extras: $ => [ |
| 23 | +@@ -870,6 +871,7 @@ module.exports = grammar({ |
| 24 | + $.derived_type, |
| 25 | + alias($.procedure_declaration, $.procedure), |
| 26 | + $.declared_type, |
| 27 | ++ $.macro_identifier, |
| 28 | + )), |
| 29 | + optional(seq(',', |
| 30 | + commaSep1( |
| 31 | +diff --git a/src/scanner.c b/src/scanner.c |
| 32 | +index b768d99..e477df4 100644 |
| 33 | +--- a/src/scanner.c |
| 34 | ++++ b/src/scanner.c |
| 35 | +@@ -1,4 +1,5 @@ |
| 36 | + #include "tree_sitter/alloc.h" |
| 37 | ++#include "tree_sitter/array.h" |
| 38 | + #include "tree_sitter/parser.h" |
| 39 | + #include <ctype.h> |
| 40 | + #include <wctype.h> |
| 41 | +@@ -13,10 +14,12 @@ enum TokenType { |
| 42 | + END_OF_STATEMENT, |
| 43 | + PREPROC_UNARY_OPERATOR, |
| 44 | + HOLLERITH_CONSTANT, |
| 45 | ++ MACRO_IDENTIFIER, |
| 46 | + }; |
| 47 | + |
| 48 | + typedef struct { |
| 49 | + bool in_line_continuation; |
| 50 | ++ Array(char *) MacroIdentifiers; |
| 51 | + } Scanner; |
| 52 | + |
| 53 | + typedef enum { |
| 54 | +@@ -301,31 +304,44 @@ static bool scan_end_line_continuation(Scanner *scanner, TSLexer *lexer) { |
| 55 | + return true; |
| 56 | + } |
| 57 | + |
| 58 | +-static bool scan_string_literal_kind(TSLexer *lexer) { |
| 59 | +- // Strictly, it's allowed for the kind to be an integer literal, in |
| 60 | +- // practice I've not seen it |
| 61 | ++typedef Array(char) String; |
| 62 | ++ |
| 63 | ++// Returns NULL if no identifier starts here, else a String the caller must free |
| 64 | ++static String *scan_identifier(TSLexer *lexer) { |
| 65 | + if (!iswalpha(lexer->lookahead)) { |
| 66 | ++ return NULL; |
| 67 | ++ } |
| 68 | ++ String *possible_identifier = ts_calloc(1, sizeof(String)); |
| 69 | ++ while (is_identifier_char(lexer->lookahead) && !lexer->eof(lexer)) { |
| 70 | ++ array_push(possible_identifier, lexer->lookahead); |
| 71 | ++ // Mark the token end in front of every underscore, so that a string kind |
| 72 | ++ // stops short of its trailing `_`. A caller that wants the whole |
| 73 | ++ // identifier as the token is free to mark the end again afterwards |
| 74 | ++ if (lexer->lookahead == '_') { |
| 75 | ++ lexer->mark_end(lexer); |
| 76 | ++ } |
| 77 | ++ advance(lexer); |
| 78 | ++ } |
| 79 | ++ if (possible_identifier->size == 0) { |
| 80 | ++ array_delete(possible_identifier); |
| 81 | ++ ts_free(possible_identifier); |
| 82 | ++ return NULL; |
| 83 | ++ } |
| 84 | ++ return possible_identifier; |
| 85 | ++} |
| 86 | ++ |
| 87 | ++static bool scan_string_literal_kind(TSLexer *lexer, String *identifier) { |
| 88 | ++ // A string kind is an identifier ending in '_' directly before a quote |
| 89 | ++ const bool opens_string = |
| 90 | ++ lexer->lookahead == '"' || lexer->lookahead == '\''; |
| 91 | ++ const bool ends_in_underscore = |
| 92 | ++ identifier->size > 0 && |
| 93 | ++ identifier->contents[identifier->size - 1] == '_'; |
| 94 | ++ if (!ends_in_underscore || !opens_string) { |
| 95 | + return false; |
| 96 | + } |
| 97 | + |
| 98 | + lexer->result_symbol = STRING_LITERAL_KIND; |
| 99 | +- |
| 100 | +- // We need two characters of lookahead to see `_"` |
| 101 | +- char current_char = '\0'; |
| 102 | +- |
| 103 | +- while (is_identifier_char(lexer->lookahead) && !lexer->eof(lexer)) { |
| 104 | +- current_char = lexer->lookahead; |
| 105 | +- // Don't capture the trailing underscore as part of the kind identifier |
| 106 | +- if (lexer->lookahead == '_') { |
| 107 | +- lexer->mark_end(lexer); |
| 108 | +- } |
| 109 | +- advance(lexer); |
| 110 | +- } |
| 111 | +- |
| 112 | +- if ((current_char != '_') || (lexer->lookahead != '"' && lexer->lookahead != '\'')) { |
| 113 | +- return false; |
| 114 | +- } |
| 115 | +- |
| 116 | + return true; |
| 117 | + } |
| 118 | + |
| 119 | +@@ -393,6 +409,28 @@ static bool scan_string_literal(TSLexer *lexer) { |
| 120 | + return false; |
| 121 | + } |
| 122 | + |
| 123 | ++static bool scan_macro_identifier(Scanner *scanner, TSLexer *lexer, |
| 124 | ++ String *identifier) { |
| 125 | ++ // Accept `identifier` only when it exactly equals one of the macro names |
| 126 | ++ // held in scanner->MacroIdentifiers. An empty list never enters the loop |
| 127 | ++ const size_t known_count = scanner->MacroIdentifiers.size; |
| 128 | ++ for (size_t index = 0; index < known_count; index++) { |
| 129 | ++ const char *candidate = *array_get(&scanner->MacroIdentifiers, index); |
| 130 | ++ const unsigned candidate_len = strlen(candidate); |
| 131 | ++ // Equal length plus equal leading bytes means the names are identical |
| 132 | ++ const bool same_name = |
| 133 | ++ identifier->size == candidate_len && |
| 134 | ++ strncmp(candidate, identifier->contents, identifier->size) == 0; |
| 135 | ++ if (same_name) { |
| 136 | ++ // The whole identifier is the token, so end it at the current position |
| 137 | ++ lexer->mark_end(lexer); |
| 138 | ++ lexer->result_symbol = MACRO_IDENTIFIER; |
| 139 | ++ return true; |
| 140 | ++ } |
| 141 | ++ } |
| 142 | ++ return false; |
| 143 | ++} |
| 144 | ++ |
| 145 | + /// Need an external scanner to catch '!' before its parsed as a comment |
| 146 | + static bool scan_preproc_unary_operator(TSLexer *lexer) { |
| 147 | + const char next_char = lexer->lookahead; |
| 148 | +@@ -467,19 +505,50 @@ static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) { |
| 149 | + return true; |
| 150 | + } |
| 151 | + |
| 152 | +- if (valid_symbols[STRING_LITERAL_KIND]) { |
| 153 | ++ // These symbols both scan for an identifier, we need to combine the logic |
| 154 | ++ // and they always need to be the last to look for since we can't backtrack |
| 155 | ++ if (valid_symbols[STRING_LITERAL_KIND] || valid_symbols[MACRO_IDENTIFIER]) { |
| 156 | ++ String *identifier = scan_identifier(lexer); |
| 157 | ++ bool identifier_result = false; |
| 158 | + // This may need a lot of lookahead, so should (probably) always |
| 159 | + // be the last token to look for |
| 160 | +- if (scan_string_literal_kind(lexer)) { |
| 161 | ++ if (identifier && valid_symbols[STRING_LITERAL_KIND]) { |
| 162 | ++ identifier_result = scan_string_literal_kind(lexer, identifier); |
| 163 | ++ } |
| 164 | ++ if (!identifier_result && identifier && valid_symbols[MACRO_IDENTIFIER]) { |
| 165 | ++ identifier_result = scan_macro_identifier(scanner, lexer, identifier); |
| 166 | ++ } |
| 167 | ++ if (identifier) { |
| 168 | ++ // scan_identifier allocates both the String header and its character |
| 169 | ++ // buffer: array_delete releases the buffer, ts_free the header. |
| 170 | ++ // Freeing only the header would leak the buffer on every call |
| 171 | ++ array_delete(identifier); |
| 172 | ++ ts_free(identifier); |
| 173 | ++ } |
| 174 | ++ if (identifier_result) { |
| 175 | + return true; |
| 176 | + } |
| 177 | + } |
| 178 | +- |
| 179 | + return false; |
| 180 | + } |
| 181 | + |
| 182 | + void *tree_sitter_fortran_external_scanner_create() { |
| 183 | +- return ts_calloc(1, sizeof(bool)); |
| 184 | ++ Scanner *result = (Scanner *)ts_calloc(1, sizeof(Scanner)); |
| 185 | ++ // CODEE_TS_MACRO_IDS is a ':'-separated list of macro names. Walk it |
| 186 | ++ // read-only: the string returned by getenv must not be modified |
| 187 | ++ const char *macro_ids = getenv("CODEE_TS_MACRO_IDS"); |
| 188 | ++ while (macro_ids && *macro_ids) { |
| 189 | ++ size_t length = strcspn(macro_ids, ":"); |
| 190 | ++ if (length > 0) { |
| 191 | ++ char *new_str = (char *)ts_malloc(length + 1); |
| 192 | ++ memcpy(new_str, macro_ids, length); |
| 193 | ++ new_str[length] = '\0'; |
| 194 | ++ array_push(&result->MacroIdentifiers, new_str); |
| 195 | ++ } |
| 196 | ++ // Step over the name and the separator that follows it, if any |
| 197 | ++ macro_ids += length + (macro_ids[length] == ':'); |
| 198 | ++ } |
| 199 | ++ return result; |
| 200 | + } |
| 201 | + |
| 202 | + bool tree_sitter_fortran_external_scanner_scan(void *payload, TSLexer *lexer, |
| 203 | +@@ -491,8 +560,9 @@ bool tree_sitter_fortran_external_scanner_scan(void *payload, TSLexer *lexer, |
| 204 | + unsigned tree_sitter_fortran_external_scanner_serialize(void *payload, |
| 205 | + char *buffer) { |
| 206 | + Scanner *scanner = (Scanner *)payload; |
| 207 | +- buffer[0] = (char)scanner->in_line_continuation; |
| 208 | +- return 1; |
| 209 | ++ // Copy the raw Scanner bytes; NOTE(review): that includes heap pointers |
| 210 | ++ memcpy(buffer, scanner, sizeof(*scanner)); |
| 211 | ++ return sizeof(*scanner); |
| 212 | + } |
| 213 | + |
| 214 | + void tree_sitter_fortran_external_scanner_deserialize(void *payload, |
| 215 | +@@ -500,11 +570,17 @@ void tree_sitter_fortran_external_scanner_deserialize(void *payload, |
| 216 | + unsigned length) { |
| 217 | + Scanner *scanner = (Scanner *)payload; |
| 218 | + if (length > 0) { |
| 219 | +- scanner->in_line_continuation = buffer[0]; |
| 220 | ++ // Restore only the flag; MacroIdentifiers owns pointers that must survive |
| 221 | ++ memcpy(&scanner->in_line_continuation, buffer, sizeof(bool)); |
| 222 | + } |
| 223 | + } |
| 224 | + |
| 225 | + void tree_sitter_fortran_external_scanner_destroy(void *payload) { |
| 226 | + Scanner *scanner = (Scanner *)payload; |
| 227 | ++ // Release each stored macro name, then the array that held the pointers |
| 228 | ++ while (scanner->MacroIdentifiers.size > 0) { |
| 229 | ++ ts_free(array_pop(&scanner->MacroIdentifiers)); |
| 230 | ++ } |
| 231 | ++ array_delete(&scanner->MacroIdentifiers); |
| 232 | + ts_free(scanner); |
| 233 | + } |
0 commit comments