Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: status-codes (fixes #161) #164

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions demo.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
<?php
spekulatius marked this conversation as resolved.
Show resolved Hide resolved

require __DIR__.'/vendor/autoload.php';

// ENTER YOUR URL HERE:
$url = 'http://github.com/spekulatius/PHPScraper';
echo 'requesting ', $url, "\n";

$web = new \Spekulatius\PHPScraper\PHPScraper();
$web->go($url);

// A final URL that differs from the requested one means we were redirected.
if ($web->currentUrl !== $url) {
    echo 'redirected to ', $web->currentUrl, "\n";
}
echo 'status code ', $web->statusCode, "\n";

if ($web->isGone) {
    echo "delete/deactivate record from database\n";
} else {
    if ($web->permanentRedirectUrl !== '') {
        echo 'url changed - update url in database to ', $web->permanentRedirectUrl, "\n";
    }

    // Retry time reported by the scraper; a falsy value means none was reported.
    $retryAt = $web->retryAt;

    // Fallback delay (seconds) used only when no retry time was reported.
    $fallbackDelay = 0;

    if ($web->isSuccess) {
        echo "got data successfully - process it now...\n";
    } elseif ($web->isTemporaryResult) {
        echo "temporary error\n";
        // FIXME: use longer times if we get the same status code multiple times
        $fallbackDelay = 15*60;
    } else {
        echo "might be a permanent error - but who knows if the server changes its mind (e.g. if the result is caused by some administrative work on the server) --> try several times before considering it final\n";
        // FIXME: use longer times if we get the same status code multiple times OR consider it somewhen really permanent and delete/deactivate record from database
        $fallbackDelay = 24*60*60;
    }

    if (!$retryAt && $fallbackDelay > 0) {
        $retryAt = time() + $fallbackDelay;
    }

    if ($retryAt) {
        echo 'retry at ', date('Y-m-d H:i:s', $retryAt), "\n";
    }
}
121 changes: 121 additions & 0 deletions src/GoutteClient.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
<?php

namespace Spekulatius\PHPScraper;

use Goutte\Client;
use Symfony\Component\DomCrawler\Crawler;

/**
* Extended Goutte\Client with PHPScraper specific methods
*/
class GoutteClient extends Client
{
    /**
     * Was a temporary redirect involved in loading this request?
     *
     * @var bool
     */
    public $usesTemporaryRedirect = false;

    /**
     * Should subsequent requests go to a different URL?
     * Null as long as no permanent redirect has been recorded for the current request.
     *
     * @var string|null
     */
    public $permanentRedirectUrl = null;

    /**
     * Which is the earliest moment to retry the request because of an outdated redirect? (unix timestamp)
     * PHP_INT_MAX is the "nothing recorded" sentinel.
     *
     * @var int
     */
    protected $retryRedirectAt = PHP_INT_MAX;

    /**
     * Which is the earliest moment to retry the request because of a failed request? (unix timestamp)
     * 0 is the "nothing recorded" sentinel.
     *
     * @var int
     */
    protected $retryFailureAt = 0;

    /**
     * Reset internal variables
     *
     * Restores every per-request property of this class to its default value.
     */
    public function initNewRequest()
    {
        $this->usesTemporaryRedirect = false;
        $this->permanentRedirectUrl = null;
        $this->retryRedirectAt = PHP_INT_MAX;
        $this->retryFailureAt = 0;
    }

    /**
     * Remember permanent redirect url and detect if the redirect chain contains temporary redirects
     *
     * @return Crawler
     */
    public function followRedirect(): Crawler
    {
        $status = $this->internalResponse->getStatusCode();
        if ($status === 200 /* META REFRESH */ || $status === 301 /* Moved Permanently */ || $status === 308 /* Permanent Redirect */) {
            // Only record the target while the chain so far is purely permanent
            // and the response does not carry a Retry-After header.
            if (!$this->usesTemporaryRedirect && empty($this->internalResponse->getHeader('Retry-After'))) {
                $this->permanentRedirectUrl = $this->redirect;
            }
        } else { // $status === 300 /* Multiple Choices */ || $status === 302 /* Found */ || $status === 303 /* See Other */ || $status === 307 /* Temporary Redirect */
            $this->usesTemporaryRedirect = true;
        }

        // 300 Multiple Choices might also be handled as permanent redirect
        // META REFRESH might also be handled as temporary redirect if the delay is > 1s
        return parent::followRedirect();
    }

    /**
     * Evaluate the Retry-After header
     *
     * For status codes >= 400 the latest retry time is kept, for status codes 300-399 the
     * earliest one. Header values that are neither numeric nor a parsable date are ignored.
     *
     * see https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Retry-After
     *
     * @param object $response Response to inspect; must offer getHeader() and getStatusCode()
     *
     * @return object Whatever the parent implementation returns for this response
     */
    protected function filterResponse(object $response)
    {
        $retryAfterHeaders = $response->getHeader('Retry-After', false);
        if (!empty($retryAfterHeaders)) {
            $status = $response->getStatusCode();
            foreach ($retryAfterHeaders as $retryAfter) {
                if (is_numeric($retryAfter)) {
                    // Delay in seconds; cast so the timestamp stays an integer.
                    $retryAt = time() + (int) $retryAfter;
                } else {
                    $retryAt = strtotime($retryAfter);
                    if ($retryAt === false) {
                        // Unparsable date: skip it instead of storing `false` as a timestamp.
                        continue;
                    }
                }

                if ($status >= 400) { // usually 429 Too Many Request or 503 Service Unavailable
                    if ($this->retryFailureAt < $retryAt) {
                        $this->retryFailureAt = $retryAt;
                    }
                } elseif ($status >= 300) {
                    if ($this->retryRedirectAt > $retryAt) {
                        $this->retryRedirectAt = $retryAt;
                    }
                }
            }
        }

        return parent::filterResponse($response);
    }

    /**
     * Calculate the earliest moment to retry the request
     *
     * A recorded failure retry time takes precedence over a redirect retry time.
     *
     * @return int Unix timestamp, or 0 if no retry time was recorded
     */
    public function retryAt(): int
    {
        if ($this->retryFailureAt) {
            return $this->retryFailureAt;
        }
        if ($this->retryRedirectAt < PHP_INT_MAX) {
            return $this->retryRedirectAt;
        }

        return 0;
    }
}
1 change: 0 additions & 1 deletion src/PHPScraper.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
* Most calls are passed through to the Core class.
*/

use Goutte\Client as GoutteClient;
use Symfony\Component\HttpClient\HttpClient as SymfonyHttpClient;

class PHPScraper
Expand Down
103 changes: 101 additions & 2 deletions src/UsesGoutte.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

namespace Spekulatius\PHPScraper;

use Goutte\Client as GoutteClient;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Contracts\HttpClient\HttpClientInterface;

Expand All @@ -29,6 +28,27 @@ trait UsesGoutte
*/
protected $currentPage = null;

/**
spekulatius marked this conversation as resolved.
Show resolved Hide resolved
* Was a temporary redirect involved in loading this request?
*
* @var bool
*/
protected $usesTemporaryRedirect = false;

/**
* Should subsequent requests go to a different URL?
*
* @var string
*/
protected $permanentRedirectUrl = '';

/**
* Which is the earliest moment to retry the request? (unix timestamp)
*
* @var int
*/
protected $retryAt = 0;

/**
* Overwrites the client
*
Expand Down Expand Up @@ -74,9 +94,19 @@ public function client(): GoutteClient
*/
public function go(string $url): self
{
    // Clear redirect/retry state left over from a previous request.
    $this->client->initNewRequest();

    // Keep it around for internal processing.
    $this->currentPage = $this->client->request('GET', $url);

    // Remember request properties.
    $this->usesTemporaryRedirect = $this->client->usesTemporaryRedirect;
    $this->permanentRedirectUrl = $this->client->permanentRedirectUrl ?? '';
    $this->retryAt = $this->client->retryAt();
    if (!$this->retryAt && $this->statusCode() === 509 /* Bandwidth Limit Exceeded */) {
        // give providers in each timezone the chance to reset the traffic quota for month:
        // 12:00 UTC on the 1st is the moment the last timezone has entered the new month.
        // "first day of" is required - a bare "next month" keeps the current day of month.
        $this->retryAt = strtotime('first day of next month 12:00 UTC');
    }

    return $this;
}

Expand Down Expand Up @@ -133,4 +163,73 @@ public function clickLink($titleOrUrl): self

return $this;
}
}

/**
 * Is the current result expected to change when the request is repeated later?
 *
 * True if a temporary redirect was involved in loading the page or if the
 * status code is one of the codes listed below.
 *
 * @return bool
 */
public function isTemporaryResult(): bool
{
    // Strict comparison: statusCode() returns an int, so no type juggling is wanted here.
    return $this->usesTemporaryRedirect || \in_array($this->statusCode(), [
        408, // Request Timeout
        409, // Conflict
        419, // Page Expired
        420, // Enhance Your Calm
        421, // Misdirected Request
        423, // Locked
        425, // Too Early
        429, // Too Many Requests
        500, // Internal Server Error
        502, // Bad Gateway
        503, // Service Unavailable
        504, // Gateway Timeout
        507, // Insufficient Storage
        520, // Web Server returned an unknown error
        521, // Web Server is down
        522, // Connection Timed Out
        523, // Origin is unreachable
        524, // A timeout occurred
        525, // SSL Handshake Failed
        527, // Railgun Error
        529, // Site is overloaded
        598, // Network read timeout error
        599, // Network Connect Timeout Error
    ], true);
}

/**
 * Did the server answer with 410 Gone for a result that is not temporary?
 *
 * @return bool
 */
public function isGone(): bool
{
    // A temporary result is never reported as gone.
    if ($this->isTemporaryResult()) {
        return false;
    }

    return $this->statusCode() === 410 /* Gone */;
}

/**
 * Is the current result an error (status code >= 400) that is not classified as temporary?
 *
 * @return bool
 */
public function isPermanentError(): bool
{
    // Anything below 400 is not an error at all.
    if ($this->statusCode() < 400) {
        return false;
    }

    return !$this->isTemporaryResult();
}

/**
 * Was a temporary redirect involved in loading this request?
 *
 * Returns the flag that `go` copies from the client after the request.
 *
 * @return bool
 */
public function usesTemporaryRedirect(): bool
{
return $this->usesTemporaryRedirect;
}

/**
 * Should subsequent requests go to a different URL?
 *
 * @return string The URL stored by `go`; an empty string if the client reported none
 */
public function permanentRedirectUrl(): string
{
return $this->permanentRedirectUrl;
}

/**
 * Which is the earliest moment to retry the request?
 *
 * @return int Unix timestamp stored by `go`; 0 if no retry time is set
 */
public function retryAt(): int
{
return $this->retryAt;
}

/**
 * HTTP status code of the client's current response.
 *
 * @return int
 *
 * @throws \Exception if no page has been loaded yet
 */
public function statusCode(): int
{
    if ($this->currentPage !== null) {
        return $this->client->getResponse()->getStatusCode();
    }

    // No navigation has happened so far, so there is no response to inspect.
    throw new \Exception('You can not access the status code before your first navigation using `go`.');
}

/**
 * Is the status code of the current response in the 2xx range?
 *
 * @return bool
 */
public function isSuccess(): bool
{
    $code = $this->statusCode();

    return $code >= 200 && $code <= 299;
}

}
22 changes: 0 additions & 22 deletions tests/NotFoundTest.php

This file was deleted.

Loading