From c4862ef227f30378ca75ad6c889830352b0e0058 Mon Sep 17 00:00:00 2001 From: ksurent Date: Mon, 4 Apr 2016 17:09:40 +0200 Subject: [PATCH 1/2] basic tests for checking byte offsets of tokens --- t/08_regression.t | 2 + t/29_token_offsets.t | 212 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 214 insertions(+) create mode 100644 t/29_token_offsets.t diff --git a/t/08_regression.t b/t/08_regression.t index 5b7bee8b..872b292e 100644 --- a/t/08_regression.t +++ b/t/08_regression.t @@ -69,6 +69,7 @@ SCOPE: { # Check the regexp matches what we would expect (specifically # the fine details about the sections. my $expected = { + _byte_start => 0, _sections => 2, braced => 1, content => 's {foo} i', @@ -99,6 +100,7 @@ SCOPE: { # Check the internal details as before my $expected = { + _byte_start => 0, _sections => 2, _error => "No second section of regexp, or does not start with a balanced character", braced => 1, diff --git a/t/29_token_offsets.t b/t/29_token_offsets.t new file mode 100644 index 00000000..cd74bd1e --- /dev/null +++ b/t/29_token_offsets.t @@ -0,0 +1,212 @@ +#!/usr/bin/perl + +use utf8; +use open qw(:std :utf8); +use strict; +BEGIN { + no warnings 'once'; + $| = 1; + $PPI::XS_DISABLE = 1; + $PPI::Lexer::X_TOKENIZER ||= $ENV{X_TOKENIZER}; +} + +use Test::More tests => 12; +use Test::NoWarnings; +use PPI; + +my @tests = ( +[ + "use strict;", + [ + [0, 2], # use + [3, 3], # whitespace + [4, 9], # strict + [10, 10], # ; + ], +], +[ + "use strict;\nuse warnings;", + [ + [0, 2], # use + [3, 3], # whitespace + [4, 9], # strict + [10, 10], # ; + [11, 11], # newline + [12, 14], # use + [15, 15], # whitespace + [16, 23], # warnings + [24, 24], # ; + ], +], +[ + "my \$var = <new( \$t->[0] ); + + my $ok = is_deeply( + [ map($_->byte_span, $Document->tokens) ], + $t->[1], + "Tokens have correct byte spans" + ); + + unless($ok) { + diag($t->[0]); + } +} From 49aa38382fc8ee49ea2cfe8ae4df1d9e6c0d7aa3 Mon Sep 17 00:00:00 2001 From: ksurent Date: Mon, 4 Apr 2016 
17:10:50 +0200 Subject: [PATCH 2/2] add byte_span() to PPI::Token byte_span() allows to refer to specific ranges of source code within a document note that this works on Token level to allow referring to a specific token within an Element (e.g. PPI::Token::Word within PPI::Statement::Include) for every token we now store its starting byte position and ending byte is deduced as start byte + token byte length this works for all token types except HereDoc, which requires some extra bookkeeping see comments in lib/PPI/Token/HereDoc.pm for further details --- lib/PPI/Normal.pm | 5 ++++ lib/PPI/Token.pm | 57 +++++++++++++++++++++++++++++++++++++++- lib/PPI/Token/HereDoc.pm | 18 ++++++++++++- lib/PPI/Tokenizer.pm | 19 ++++++++++++-- 4 files changed, 95 insertions(+), 4 deletions(-) diff --git a/lib/PPI/Normal.pm b/lib/PPI/Normal.pm index 7c5e9c43..fe991b0b 100644 --- a/lib/PPI/Normal.pm +++ b/lib/PPI/Normal.pm @@ -194,6 +194,11 @@ sub process { &{"$function"}( $self->{Document} ); } + # Reset token offsets as they are not valid anymore + for my $token ($self->{Document}->tokens) { + $token->{_byte_start} = -1; + } + # Create the normalized Document object my $Normalized = PPI::Document::Normalized->new( Document => $self->{Document}, diff --git a/lib/PPI/Token.pm b/lib/PPI/Token.pm index 81660f91..e3267dbe 100644 --- a/lib/PPI/Token.pm +++ b/lib/PPI/Token.pm @@ -25,6 +25,8 @@ use Params::Util qw{_INSTANCE}; use PPI::Element (); use PPI::Exception (); +require bytes; + use vars qw{$VERSION @ISA}; BEGIN { $VERSION = '1.220'; } @@ -83,7 +85,10 @@ use PPI::Token::Unknown (); # Constructor and Related sub new { - bless { content => (defined $_[1] ? "$_[1]" : '') }, $_[0]; + bless { + content => (defined $_[1] ? "$_[1]" : ''), + _byte_start => (defined $_[2] ? $_[2] : -1), + }, $_[0]; } sub set_class { @@ -158,6 +163,56 @@ The C<length> method returns the length of the string in a Token. 
sub length { CORE::length($_[0]->{content}) } +=pod + +=head2 byte_span + +Returns an arrayref with zero-based offsets of the first and last bytes of that +Token. + +Offsets are absolute byte positions within a Document, meaning the very first +byte of the first token is always at position zero and the last byte of the +last token is always at position I<size - 1>. + +Example: + + my $Document = PPI::Document->new( \'my $var = 42;' ); + [ map($_->byte_span, $Document->tokens) ]; + +will produce the following: + + [ + [0, 2], # my + [3, 3], # whitespace + [4, 7], # $var + [8, 8], # whitespace + [9, 9], # = + [10, 10], # whitespace + [11, 12], # 42 + [13, 13], # ; + ] + +Returns C<undef> for tokens with unknown position (e.g. tokens not attached to +a Document). + +For some token types computing a byte span is not supported. Currently there's +only one unsupported type: L<PPI::Token::HereDoc>. +Tokens of that type still contribute to the total size of the Document but do +not have a span of their own (meaning this method will return C<undef>). + +Normalising a Document invalidates offsets of all tokens, making this method +return C<undef>. + +B<Note>: as the method name suggests, offsets are calculated in bytes, not +characters. 
+ +=cut + +sub byte_span { + my $start = $_[0]->{_byte_start}; + return undef if $start < 0; + [ $start, $start + bytes::length($_[0]->{content}) - 1 ]; +} diff --git a/lib/PPI/Token/HereDoc.pm b/lib/PPI/Token/HereDoc.pm index 86e2a75d..3ff045f1 100644 --- a/lib/PPI/Token/HereDoc.pm +++ b/lib/PPI/Token/HereDoc.pm @@ -93,7 +93,7 @@ BEGIN { @ISA = 'PPI::Token'; } - +require bytes; @@ -214,6 +214,16 @@ sub __TOKENIZER__on_char { # when we are re-assembling the file $token->{_terminator_line} = $line; + # Actual content and terminator are not included when + # computing a HereDoc's byte length so we need to stash + # it so that we can manually fixup offsets later + # + # A line may contain multiple heredocs and we need them + # all so we're adding the length, not overwriting it + $t->{__current_heredoc_byte_length} += + bytes::length(join("", @heredoc)) + + bytes::length($line); + # The HereDoc is now fully parsed return $t->_finalize_token->__TOKENIZER__on_char( $t ); } @@ -252,6 +262,12 @@ sub __TOKENIZER__on_char { $t->_finalize_token->__TOKENIZER__on_char( $t ); } +# override byte_span() from the parent class because +# for heredocs byte offsets are undefined +sub byte_span { + return undef; +} + 1; =pod diff --git a/lib/PPI/Tokenizer.pm b/lib/PPI/Tokenizer.pm index 58fdf8c2..7b0622ef 100644 --- a/lib/PPI/Tokenizer.pm +++ b/lib/PPI/Tokenizer.pm @@ -92,6 +92,8 @@ BEGIN { $VERSION = '1.220'; } +require bytes; + # The x operator cannot follow most Perl operators, implying that # anything beginning with x following an operator is a word. # These are the exceptions. 
@@ -146,6 +148,11 @@ sub new { class => 'PPI::Token::BOM', zone => 'PPI::Token::Whitespace', + # Bookkeeping needed to track byte offsets + file_byte_cursor => 0, + __total_heredoc_byte_length => 0, + __current_heredoc_byte_length => 0, + # Output token buffer tokens => [], token_cursor => 0, @@ -464,6 +471,8 @@ sub _fill_line { $self->{line_length} = length $line; $self->{line_count}++; + $self->{__total_heredoc_byte_length} += $self->{__current_heredoc_byte_length}; + 1; } @@ -600,10 +609,16 @@ sub _process_next_char { # Returns the resulting parse class as a convenience. sub _finalize_token { my $self = shift; - return $self->{class} unless defined $self->{token}; + + defined(my $tok = $self->{token}) or return $self->{class}; + + # Include heredoc content and terminators + $tok->{_byte_start} = $self->{file_byte_cursor} + $self->{__total_heredoc_byte_length}; + + $self->{file_byte_cursor} += bytes::length($tok->{content}); # Add the token to the token buffer - push @{ $self->{tokens} }, $self->{token}; + push @{ $self->{tokens} }, $tok; $self->{token} = undef; # Return the parse class to that of the zone we are in