From c4862ef227f30378ca75ad6c889830352b0e0058 Mon Sep 17 00:00:00 2001 From: ksurent Date: Mon, 4 Apr 2016 17:09:40 +0200 Subject: [PATCH 1/2] basic tests for checking byte offsets of tokens --- t/08_regression.t | 2 + t/29_token_offsets.t | 212 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 214 insertions(+) create mode 100644 t/29_token_offsets.t diff --git a/t/08_regression.t b/t/08_regression.t index 5b7bee8b..872b292e 100644 --- a/t/08_regression.t +++ b/t/08_regression.t @@ -69,6 +69,7 @@ SCOPE: { # Check the regexp matches what we would expect (specifically # the fine details about the sections. my $expected = { + _byte_start => 0, _sections => 2, braced => 1, content => 's {foo} i', @@ -99,6 +100,7 @@ SCOPE: { # Check the internal details as before my $expected = { + _byte_start => 0, _sections => 2, _error => "No second section of regexp, or does not start with a balanced character", braced => 1, diff --git a/t/29_token_offsets.t b/t/29_token_offsets.t new file mode 100644 index 00000000..cd74bd1e --- /dev/null +++ b/t/29_token_offsets.t @@ -0,0 +1,212 @@ +#!/usr/bin/perl + +use utf8; +use open qw(:std :utf8); +use strict; +BEGIN { + no warnings 'once'; + $| = 1; + $PPI::XS_DISABLE = 1; + $PPI::Lexer::X_TOKENIZER ||= $ENV{X_TOKENIZER}; +} + +use Test::More tests => 12; +use Test::NoWarnings; +use PPI; + +my @tests = ( +[ + "use strict;", + [ + [0, 2], # use + [3, 3], # whitespace + [4, 9], # strict + [10, 10], # ; + ], +], +[ + "use strict;\nuse warnings;", + [ + [0, 2], # use + [3, 3], # whitespace + [4, 9], # strict + [10, 10], # ; + [11, 11], # newline + [12, 14], # use + [15, 15], # whitespace + [16, 23], # warnings + [24, 24], # ; + ], +], +[ + "my \$var = <new( \$t->[0] ); + + my $ok = is_deeply( + [ map($_->byte_span, $Document->tokens) ], + $t->[1], + "Tokens have correct byte spans" + ); + + unless($ok) { + diag($t->[0]); + } +} From 49aa38382fc8ee49ea2cfe8ae4df1d9e6c0d7aa3 Mon Sep 17 00:00:00 2001 From: ksurent Date: Mon, 4 Apr 2016 
17:10:50 +0200 Subject: [PATCH 2/2] add byte_span() to PPI::Token byte_span() allows to refer to specific ranges of source code within a document note that this works on Token level to allow referring to a specific token within an Element (e.g. PPI::Token::Word within PPI::Statement::Include) for every token we now store its starting byte position and ending byte is deduced as start byte + token byte length this works for all token types except HereDoc, which requires some extra bookkeeping see comments in lib/PPI/Token/HereDoc.pm for further details --- lib/PPI/Normal.pm | 5 ++++ lib/PPI/Token.pm | 57 +++++++++++++++++++++++++++++++++++++++- lib/PPI/Token/HereDoc.pm | 18 ++++++++++++- lib/PPI/Tokenizer.pm | 19 ++++++++++++-- 4 files changed, 95 insertions(+), 4 deletions(-) diff --git a/lib/PPI/Normal.pm b/lib/PPI/Normal.pm index 7c5e9c43..fe991b0b 100644 --- a/lib/PPI/Normal.pm +++ b/lib/PPI/Normal.pm @@ -194,6 +194,11 @@ sub process { &{"$function"}( $self->{Document} ); } + # Reset token offsets as they are not valid anymore + for my $token ($self->{Document}->tokens) { + $token->{_byte_start} = -1; + } + # Create the normalized Document object my $Normalized = PPI::Document::Normalized->new( Document => $self->{Document}, diff --git a/lib/PPI/Token.pm b/lib/PPI/Token.pm index 81660f91..e3267dbe 100644 --- a/lib/PPI/Token.pm +++ b/lib/PPI/Token.pm @@ -25,6 +25,8 @@ use Params::Util qw{_INSTANCE}; use PPI::Element (); use PPI::Exception (); +require bytes; + use vars qw{$VERSION @ISA}; BEGIN { $VERSION = '1.220'; } @@ -83,7 +85,10 @@ use PPI::Token::Unknown (); # Constructor and Related sub new { - bless { content => (defined $_[1] ? "$_[1]" : '') }, $_[0]; + bless { + content => (defined $_[1] ? "$_[1]" : ''), + _byte_start => (defined $_[2] ? $_[2] : -1), + }, $_[0]; } sub set_class { @@ -158,6 +163,56 @@ The C<length> method returns the length of the string in a Token. 
sub length { CORE::length($_[0]->{content}) } +=pod + +=head2 byte_span + +Returns an arrayref with zero-based offsets of the first and last bytes of that +Token. + +Offsets are absolute byte positions within a Document, meaning the very first +byte of the first token is always at position zero and the last byte of the +last token is always at position I<size - 1>. + +Example: + + my $Document = PPI::Document->new( \'my $var = 42;' ); + [ map($_->byte_span, $Document->tokens) ]; + +will produce the following: + + [ + [0, 2], # my + [3, 3], # whitespace + [4, 7], # $var + [8, 8], # whitespace + [9, 9], # = + [10, 10], # whitespace + [11, 12], # 42 + [13, 13], # ; + ] + +Returns C<undef> for tokens with unknown position (e.g. tokens not attached to +a Document). + +For some token types computing a byte span is not supported. Currently there's +only one unsupported type: L<PPI::Token::HereDoc>. +Tokens of that type still contribute to the total size of the Document but do +not have a span of their own (meaning this method will return C<undef>). + +Normalising a Document invalidates offsets of all tokens, making this method +return C<undef>. + +B<Note>: as the method name suggests, offsets are calculated in bytes, not +characters. 
+ +=cut + +sub byte_span { + my $start = $_[0]->{_byte_start}; + return undef if $start < 0; + [ $start, $start + bytes::length($_[0]->{content}) - 1 ]; +} diff --git a/lib/PPI/Token/HereDoc.pm b/lib/PPI/Token/HereDoc.pm index 86e2a75d..3ff045f1 100644 --- a/lib/PPI/Token/HereDoc.pm +++ b/lib/PPI/Token/HereDoc.pm @@ -93,7 +93,7 @@ BEGIN { @ISA = 'PPI::Token'; } - +require bytes; @@ -214,6 +214,16 @@ sub __TOKENIZER__on_char { # when we are re-assembling the file $token->{_terminator_line} = $line; + # Actual content and terminator are not included when + # computing a HereDoc's byte length so we need to stash + # it so that we can manually fixup offsets later + # + # A line may contain multiple heredocs and we need them + # all so we're adding the length, not overwriting it + $t->{__current_heredoc_byte_length} += + bytes::length(join("", @heredoc)) + + bytes::length($line); + # The HereDoc is now fully parsed return $t->_finalize_token->__TOKENIZER__on_char( $t ); } @@ -252,6 +262,12 @@ sub __TOKENIZER__on_char { $t->_finalize_token->__TOKENIZER__on_char( $t ); } +# override byte_span() from the parent class because +# for heredocs byte offsets are undefined +sub byte_span { + return undef; +} + 1; =pod diff --git a/lib/PPI/Tokenizer.pm b/lib/PPI/Tokenizer.pm index 58fdf8c2..7b0622ef 100644 --- a/lib/PPI/Tokenizer.pm +++ b/lib/PPI/Tokenizer.pm @@ -92,6 +92,8 @@ BEGIN { $VERSION = '1.220'; } +require bytes; + # The x operator cannot follow most Perl operators, implying that # anything beginning with x following an operator is a word. # These are the exceptions. 
@@ -146,6 +148,11 @@ sub new { class => 'PPI::Token::BOM', zone => 'PPI::Token::Whitespace', + # Bookkeeping needed to track byte offsets + file_byte_cursor => 0, + __total_heredoc_byte_length => 0, + __current_heredoc_byte_length => 0, + # Output token buffer tokens => [], token_cursor => 0, @@ -464,6 +471,8 @@ sub _fill_line { $self->{line_length} = length $line; $self->{line_count}++; + $self->{__total_heredoc_byte_length} += $self->{__current_heredoc_byte_length}; + 1; } @@ -600,10 +609,16 @@ sub _process_next_char { # Returns the resulting parse class as a convenience. sub _finalize_token { my $self = shift; - return $self->{class} unless defined $self->{token}; + + defined(my $tok = $self->{token}) or return $self->{class}; + + # Include heredoc content and terminators + $tok->{_byte_start} = $self->{file_byte_cursor} + $self->{__total_heredoc_byte_length}; + + $self->{file_byte_cursor} += bytes::length($tok->{content}); # Add the token to the token buffer - push @{ $self->{tokens} }, $self->{token}; + push @{ $self->{tokens} }, $tok; $self->{token} = undef; # Return the parse class to that of the zone we are in