@@ -89,7 +89,7 @@ public function startLexing(string $code, ErrorHandler $errorHandler = null) {
89
89
90
90
error_clear_last ();
91
91
$ this ->tokens = @token_get_all ($ code );
92
- $ this ->handleErrors ($ errorHandler );
92
+ $ this ->postprocessTokens ($ errorHandler );
93
93
94
94
if (false !== $ scream ) {
95
95
ini_set ('xdebug.scream ' , $ scream );
@@ -131,40 +131,14 @@ private function isUnterminatedComment($token) : bool {
131
131
&& substr ($ token [1 ], -2 ) !== '*/ ' ;
132
132
}
133
133
134
- /**
135
- * Check whether an error *may* have occurred during tokenization.
136
- *
137
- * @return bool
138
- */
139
- private function errorMayHaveOccurred () : bool {
140
- if (defined ('HHVM_VERSION ' )) {
141
- // In HHVM token_get_all() does not throw warnings, so we need to conservatively
142
- // assume that an error occurred
143
- return true ;
144
- }
145
-
146
- if (PHP_VERSION_ID >= 80000 ) {
147
- // PHP 8 converts the "bad character" case into a parse error, rather than treating
148
- // it as a lexing warning. To preserve previous behavior, we need to assume that an
149
- // error occurred.
150
- // TODO: We should handle this the same way as PHP 8: Only generate T_BAD_CHARACTER
151
- // token here (for older PHP versions) and leave generation of the actual parse error
152
- // to the parser. This will also save the full token scan on PHP 8 here.
153
- return true ;
154
- }
155
-
156
- return null !== error_get_last ();
157
- }
158
-
159
- protected function handleErrors (ErrorHandler $ errorHandler ) {
160
- if (!$ this ->errorMayHaveOccurred ()) {
161
- return ;
162
- }
163
-
134
+ protected function postprocessTokens (ErrorHandler $ errorHandler ) {
164
135
// PHP's error handling for token_get_all() is rather bad, so if we want detailed
165
136
// error information we need to compute it ourselves. Invalid character errors are
166
137
// detected by finding "gaps" in the token array. Unterminated comments are detected
167
138
// by checking if a trailing comment has a "*/" at the end.
139
+ //
140
+ // Additionally, we canonicalize to the PHP 8 comment format here, which does not include
141
+ // the trailing newline anymore.
168
142
169
143
$ filePos = 0 ;
170
144
$ line = 1 ;
@@ -178,6 +152,23 @@ protected function handleErrors(ErrorHandler $errorHandler) {
178
152
$ this ->handleInvalidCharacterRange ($ filePos , $ filePos + 1 , $ line , $ errorHandler );
179
153
}
180
154
155
+ if ($ token [0 ] === \T_COMMENT && preg_match ('/(\r\n|\n|\r)$/D ' , $ token [1 ], $ matches )) {
156
+ $ trailingNewline = $ matches [0 ];
157
+ $ token [1 ] = substr ($ token [1 ], 0 , -strlen ($ trailingNewline ));
158
+ $ this ->tokens [$ i ] = $ token ;
159
+ if (isset ($ this ->tokens [$ i + 1 ]) && $ this ->tokens [$ i + 1 ][0 ] === \T_WHITESPACE ) {
160
+ // Move trailing newline into following T_WHITESPACE token, if it already exists.
161
+ $ this ->tokens [$ i + 1 ][1 ] = $ trailingNewline . $ this ->tokens [$ i + 1 ][1 ];
162
+ $ this ->tokens [$ i + 1 ][2 ]--;
163
+ } else {
164
+ // Otherwise, we need to create a new T_WHITESPACE token.
165
+ array_splice ($ this ->tokens , $ i + 1 , 0 , [
166
+ [\T_WHITESPACE , $ trailingNewline , $ line ],
167
+ ]);
168
+ $ numTokens ++;
169
+ }
170
+ }
171
+
181
172
$ tokenValue = \is_string ($ token ) ? $ token : $ token [1 ];
182
173
$ tokenLen = \strlen ($ tokenValue );
183
174
0 commit comments