Skip to content

Commit 19008b7

Browse files
committed
Additional rigor when parsing URLs, actually use the public suffix list
1 parent 719eee4 commit 19008b7

19 files changed

+2145
-1812
lines changed

transfer-protocol/.phpunit.result.cache

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

transfer-protocol/bin/regenerate_public_suffix_list.php

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,11 @@
2626
fwrite($fp, "\n * Public suffix list for detecting URLs with known domains within text.");
2727
fwrite($fp, "\n * This file is automatically generated by regenerate_public_suffix_list.php.");
2828
fwrite($fp, "\n * Do not edit it directly.");
29+
fwrite($fp, "\n * @TODO: Process wildcards and exceptions, not just raw TLDs.");
2930
fwrite($fp, "\n */\n\n");
3031
fwrite($fp, "return array(\n");
3132
foreach($tlds as $tld) {
32-
fwrite($fp, "\t'".$tld."',\n");
33+
fwrite($fp, "\t'".$tld."' => 1,\n");
3334
}
3435

3536
fwrite($fp, ");\n");

transfer-protocol/bin/rewrite-urls.php

Lines changed: 42 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
11
<?php
22

3-
use Rowbot\URL\URL;
4-
53
require_once __DIR__ . "/../bootstrap.php";
64

75
if ( $argc < 2 ) {
8-
echo "Usage: php script.php <command> --file <input-file> --from-site-url <current site url> --to-url <target url>\n";
6+
echo "Usage: php script.php <command> --file <input-file> --current-site-url <current site url> --new-site-url <target url>\n";
97
echo "Commands:\n";
108
echo " list_urls: List all the URLs found in the input file.\n";
119
echo " migrate_urls: Migrate all the URLs found in the input file from the current site to the target site.\n";
@@ -27,26 +25,24 @@
2725
exit( 1 );
2826
}
2927

30-
$inputFile = $options['file'];
31-
$targetDomain = @$options['target-domain'];
32-
28+
$inputFile = $options['file'];
3329
if ( ! file_exists( $inputFile ) ) {
3430
echo "The file $inputFile does not exist.\n";
3531
exit( 1 );
3632
}
37-
3833
$block_markup = file_get_contents( $inputFile );
3934

40-
// @TODO: Should a base URL be always required?
41-
$previous_url = $options['from-site-url'] ?? 'https://w.org';
42-
$p = new WP_Block_Markup_Url_Processor( $block_markup, $previous_url );
35+
// @TODO: Decide – should the current site URL always be required to
36+
// populate $base_url?
37+
$base_url = $options['current-site-url'] ?? 'https://playground.internal';
38+
$p = new WP_Block_Markup_Url_Processor( $block_markup, $base_url );
4339

4440
switch ( $command ) {
4541
case 'list_urls':
4642
echo "URLs found in the markup:\n\n";
4743
while ( $p->next_url() ) {
4844
// Skip empty relative URLs.
49-
if ( ! trim( $p->get_url() ) ) {
45+
if ( ! trim( $p->get_raw_url() ) ) {
5046
continue;
5147
}
5248
echo '* ';
@@ -61,35 +57,53 @@
6157
echo 'In #text: ';
6258
break;
6359
}
64-
echo $p->get_url() . "\n";
60+
echo $p->get_raw_url() . "\n";
6561
}
6662
echo "\n";
6763
break;
6864
case 'migrate_urls':
69-
if ( ! isset( $options['from-site-url'] ) ) {
70-
echo "The --from-site-url option is required for the migrate_urls command.\n";
65+
if ( ! isset( $options['current-site-url'] ) ) {
66+
echo "The --current-site-url option is required for the migrate_urls command.\n";
7167
exit( 1 );
7268
}
73-
if ( ! isset( $options['to-url'] ) ) {
74-
echo "The --to-url option is required for the migrate_urls command.\n";
69+
if ( ! isset( $options['new-site-url'] ) ) {
70+
echo "The --new-site-url option is required for the migrate_urls command.\n";
7571
exit( 1 );
7672
}
77-
$parsed_prev_url = URL::parse( $options['from-site-url'] );
78-
$next_url = $options['to-url'];
79-
$parsed_new_url = URL::parse( $next_url );
80-
echo "Replacing $previous_url with $next_url in the input.\n";
73+
$parsed_current_site_url = WP_URL::parse( $options['current-site-url'] );
74+
$string_new_site_url = $options['new-site-url'];
75+
$parsed_new_site_url = WP_URL::parse( $string_new_site_url );
76+
77+
echo "Replacing $base_url with $string_new_site_url in the input.\n";
8178
echo "Note this is not yet enough to migrate the site as both the previous and the new";
8279
echo "site might be hosted on specific paths.\n\n";
8380
while ( $p->next_url() ) {
84-
$updated = false;
85-
$url = $p->get_url();
86-
$parsed_url = URL::parse( $url, $parsed_prev_url );
87-
if ( $parsed_url->hostname === $parsed_prev_url->hostname ) {
88-
$parsed_url->hostname = $parsed_new_url->hostname;
89-
if ( str_starts_with( $parsed_url->pathname, $parsed_prev_url->pathname ) ) {
90-
$parsed_url->pathname = $parsed_new_url->pathname . substr( $parsed_url->pathname, strlen( $parsed_prev_url->pathname ) );
81+
$updated = false;
82+
$matched_url = $p->get_raw_url();
83+
$parsed_matched_url = $p->get_parsed_url();
84+
if ( $parsed_matched_url->hostname === $parsed_current_site_url->hostname ) {
85+
$parsed_matched_url->hostname = $parsed_new_site_url->hostname;
86+
if ( str_starts_with( $parsed_matched_url->pathname, $parsed_current_site_url->pathname ) ) {
87+
$parsed_matched_url->pathname = $parsed_new_site_url->pathname . substr( $parsed_matched_url->pathname,
88+
strlen( $parsed_current_site_url->pathname ) );
89+
}
90+
91+
/*
92+
* Stylistic choice – if the matched URL has no trailing slash,
93+
* do not add it to the new URL. The WHATWG URL parser will
94+
* add one automatically if the path is empty, so we have to
95+
* explicitly remove it.
96+
*/
97+
$new_raw_url = $parsed_matched_url->toString();
98+
if (
99+
$matched_url[ strlen( $matched_url ) - 1 ] !== '/' &&
100+
$parsed_matched_url->pathname === '/' &&
101+
$parsed_matched_url->search === '' &&
102+
$parsed_matched_url->hash === ''
103+
) {
104+
$new_raw_url = rtrim( $new_raw_url, '/' );
91105
}
92-
$p->set_url( $parsed_url->toString() );
106+
$p->set_raw_url( $new_raw_url );
93107
}
94108
}
95109
echo $p->get_updated_html();

transfer-protocol/bootstrap.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
require_once __DIR__ . '/src/WP_Block_Markup_Processor.php';
2424
require_once __DIR__ . '/src/WP_Block_Markup_Url_Processor.php';
2525
require_once __DIR__ . '/src/WP_Migration_URL_In_Text_Processor.php';
26+
require_once __DIR__ . '/src/WP_URL.php';
2627
require_once __DIR__ . '/vendor/autoload.php';
2728

2829
function _doing_it_wrong() {

transfer-protocol/married-short.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<!-- /wp:image -->
44

55
<!-- wp:paragraph -->
6-
<p>During the <a href="writeofpassage.school">Write of Passage</a>, I stubbornly tried to beat my writer’s block by writing until 3am multiple times. The burnout returned. I dropped everything and went to Greece for a week.</p>
6+
<p>During the <a href="//writeofpassage.school/">Write of Passage</a>, I stubbornly tried to beat my writer’s block by writing until 3am multiple times. The burnout returned. I dropped everything and went to Greece for a week.</p>
77
<!-- /wp:paragraph -->
88

99
<!-- wp:paragraph -->

transfer-protocol/phpunit.xml

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
1-
<phpunit
2-
bootstrap="bootstrap.php"
3-
colors="true"
4-
verbose="true"
5-
>
6-
<testsuites>
7-
<testsuite name="Project Test Suite">
8-
<directory>./tests</directory>
9-
</testsuite>
10-
</testsuites>
1+
<?xml version="1.0"?>
2+
<phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" bootstrap="bootstrap.php" colors="true" xsi:noNamespaceSchemaLocation="https://schema.phpunit.de/10.0/phpunit.xsd" cacheDirectory=".phpunit.cache">
3+
<testsuites>
4+
<testsuite name="Application Test Suite">
5+
<file>tests/WPBlockMarkupProcessorTests.php</file>
6+
<file>tests/WPBlockMarkupUrlProcessorTests.php</file>
7+
<file>tests/WPMigrationURLInTextProcessorTests.php</file>
8+
</testsuite>
9+
</testsuites>
1110
</phpunit>

transfer-protocol/run-tests.sh

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,4 @@
11
#!/bin/bash
2-
#COMMAND="phpunit tests/WP_Migration_*"
3-
#COMMAND="phpunit tests/*.php"
4-
#COMMAND="phpunit tests/WP_Block_Markup_Url_Processor_Tests.php"
5-
#COMMAND="phpunit -c phpunit.xml"
6-
#$COMMAND
7-
#fswatch -o ./**/*.php | xargs -n1 -I{} $COMMAND
8-
9-
for i in $(ls tests/*.php | grep -v URL_Parser); do
10-
phpunit $i
11-
done
2+
COMMAND="phpunit -c ./phpunit.xml"
3+
$COMMAND
4+
fswatch -o ./**/*.php | xargs -n1 -I{} $COMMAND

transfer-protocol/src/WP_Block_Markup_Processor.php

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -144,31 +144,6 @@ public function get_block_attributes() {
144144
return $this->block_attributes;
145145
}
146146

147-
public function set_block_attributes( array $new_attributes ) {
148-
if ( null === $this->block_name ) {
149-
_doing_it_wrong(
150-
__METHOD__,
151-
__( 'Cannot set block attributes when not in `block_attributes` state' ),
152-
'WP_VERSION'
153-
);
154-
155-
return false;
156-
}
157-
158-
if ( null !== $this->block_attributes_iterator ) {
159-
_doing_it_wrong(
160-
__METHOD__,
161-
__( 'Cannot override all the block attributes when iterating over the existing attributes with next_block_attribute()' ),
162-
'WP_VERSION'
163-
);
164-
165-
return false;
166-
}
167-
168-
$this->block_attributes_updated = true;
169-
$this->block_attributes = $new_attributes;
170-
}
171-
172147
public function is_block_closer() {
173148
return $this->block_name !== null && $this->block_closer === true;
174149
}

transfer-protocol/src/WP_Block_Markup_Url_Processor.php

Lines changed: 39 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,11 @@
77
*/
88
class WP_Block_Markup_Url_Processor extends WP_Block_Markup_Processor {
99

10-
private $url;
10+
private $raw_url;
11+
/**
12+
* @var URL
13+
*/
14+
private $parsed_url;
1115
private $base_url;
1216
private $url_in_text_processor;
1317
private $url_in_text_node_updated;
@@ -27,14 +31,19 @@ public function get_updated_html() {
2731
return parent::get_updated_html();
2832
}
2933

30-
public function get_url() {
31-
return $this->url;
34+
public function get_raw_url() {
35+
return $this->raw_url;
36+
}
37+
38+
public function get_parsed_url() {
39+
return $this->parsed_url;
3240
}
3341

3442
public function next_token() {
3543
$this->get_updated_html();
3644

37-
$this->url = null;
45+
$this->raw_url = null;
46+
$this->parsed_url = null;
3847
$this->inspected_url_attribute_idx = - 1;
3948
$this->url_in_text_processor = null;
4049
// Do not reset url_in_text_node_updated – it's reset in get_updated_html() which
@@ -54,7 +63,7 @@ public function next_url() {
5463
}
5564

5665
public function next_url_in_current_token() {
57-
$this->url = null;
66+
$this->raw_url = null;
5867
switch ( parent::get_token_type() ) {
5968
case '#tag':
6069
return $this->next_url_attribute();
@@ -68,17 +77,11 @@ public function next_url_in_current_token() {
6877
}
6978

7079
private function next_url_in_text_node() {
71-
7280
if ( $this->get_token_type() !== '#text' ) {
7381
return false;
7482
}
7583

7684
if ( null === $this->url_in_text_processor ) {
77-
$this->url_in_text_processor = new WP_Migration_URL_In_Text_Processor( $this->get_modifiable_text() );
78-
}
79-
80-
while ( $this->url_in_text_processor->next_url() ) {
81-
$url = $this->url_in_text_processor->get_url();
8285
/*
8386
* Use the base URL for URLs matched in text nodes. This is the only
8487
* way to recognize a substring "WordPress.org" as a URL. We might
@@ -90,11 +93,14 @@ private function next_url_in_text_node() {
9093
* to filter out such false positives e.g. by checking the domain against
9194
* a list of accepted domains, or the TLD against a list of public suffixes.
9295
*/
93-
if ( URL::canParse( $url, $this->base_url ) ) {
94-
$this->url = $url;
96+
$this->url_in_text_processor = new WP_Migration_URL_In_Text_Processor( $this->get_modifiable_text(), $this->base_url );
97+
}
9598

96-
return true;
97-
}
99+
while ( $this->url_in_text_processor->next_url() ) {
100+
$this->raw_url = $this->url_in_text_processor->get_raw_url();
101+
$this->parsed_url = $this->url_in_text_processor->get_parsed_url();
102+
103+
return true;
98104
}
99105

100106
return false;
@@ -109,7 +115,7 @@ private function next_url_attribute() {
109115
return false;
110116
}
111117

112-
while ( ++$this->inspected_url_attribute_idx < count( self::URL_ATTRIBUTES[ $tag ] ) ) {
118+
while ( ++ $this->inspected_url_attribute_idx < count( self::URL_ATTRIBUTES[ $tag ] ) ) {
113119
$attr = self::URL_ATTRIBUTES[ $tag ][ $this->inspected_url_attribute_idx ];
114120
if ( false === $attr ) {
115121
return false;
@@ -123,10 +129,14 @@ private function next_url_attribute() {
123129
* be correctly recognized as a URL.
124130
* Without a base URL, this Processor would incorrectly skip it.
125131
*/
126-
if ( is_string( $url_maybe ) && URL::canParse( $url_maybe, $this->base_url ) ) {
127-
$this->url = $url_maybe;
132+
if ( is_string( $url_maybe ) ) {
133+
$parsed_url = WP_URL::parse( $url_maybe, $this->base_url );
134+
if ( false !== $parsed_url ) {
135+
$this->raw_url = $url_maybe;
136+
$this->parsed_url = $parsed_url;
128137

129-
return true;
138+
return true;
139+
}
130140
}
131141
}
132142

@@ -143,18 +153,22 @@ private function next_url_block_attribute() {
143153
* When a base URL is missing, the string must start with a protocol to
144154
* be considered a URL.
145155
*/
146-
if ( is_string( $url_maybe ) && URL::canParse( $url_maybe ) ) {
147-
$this->url = $url_maybe;
156+
if ( is_string( $url_maybe ) ) {
157+
$parsed_url = WP_URL::parse( $url_maybe );
158+
if ( false !== $parsed_url ) {
159+
$this->raw_url = $url_maybe;
160+
$this->parsed_url = $parsed_url;
148161

149-
return true;
162+
return true;
163+
}
150164
}
151165
}
152166

153167
return false;
154168
}
155169

156-
public function set_url( $new_url ) {
157-
if ( null === $this->url ) {
170+
public function set_raw_url( $new_url ) {
171+
if ( null === $this->raw_url ) {
158172
return false;
159173
}
160174
switch ( parent::get_token_type() ) {
@@ -176,7 +190,7 @@ public function set_url( $new_url ) {
176190
}
177191
$this->url_in_text_node_updated = true;
178192

179-
return $this->url_in_text_processor->set_url( $new_url );
193+
return $this->url_in_text_processor->set_raw_url( $new_url );
180194
}
181195
}
182196

0 commit comments

Comments
 (0)