diff --git a/README.md b/README.md index 4646425..f9e7ef0 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,9 @@ The [Sitemaps.org](http://www.sitemaps.org/) protocol is the leading standard an - Custom User-Agent string - Proxy support - URL blacklist +- request throttling (using https://github.com/hamburgscleanest/guzzle-advanced-throttle) +- retry (using https://github.com/caseyamcl/guzzle_retry_middleware) +- advanced logging (using https://github.com/gmponos/guzzle_logger) ## Formats supported - XML `.xml` @@ -33,7 +36,9 @@ The [Sitemaps.org](http://www.sitemaps.org/) protocol is the leading standard an - [mbstring](http://php.net/manual/en/book.mbstring.php) - [libxml](http://php.net/manual/en/book.libxml.php) _(enabled by default)_ - [SimpleXML](http://php.net/manual/en/book.simplexml.php) _(enabled by default)_ - +- Optional: + - https://github.com/caseyamcl/guzzle_retry_middleware + - https://github.com/hamburgscleanest/guzzle-advanced-throttle ## Installation The library is available for install via [Composer](https://getcomposer.org). Just add this to your `composer.json` file: ```json @@ -143,6 +148,122 @@ try { } ``` +### Throttling + +1. Install middleware: +```bash +composer require hamburgscleanest/guzzle-advanced-throttle +``` +2. Define host rules: + +```php +$rules = new RequestLimitRuleset([ + 'https://www.google.com' => [ + [ + 'max_requests' => 20, + 'request_interval' => 1 + ], + [ + 'max_requests' => 100, + 'request_interval' => 120 + ] + ] +]); +``` +3. Create handler stack: + +```php +$stack = new HandlerStack(); +$stack->setHandler(new CurlHandler()); +``` +4. Create middleware: +```php +$throttle = new ThrottleMiddleware($rules); + + // Invoke the middleware +$stack->push($throttle()); + +// OR: alternatively call the handle method directly +$stack->push($throttle->handle()); +``` +5. Create client manually: +```php +$client = new \GuzzleHttp\Client(['handler' => $stack]); +``` +6. 
Pass client as an argument or use `setClient` method: +```php +$parser = new SitemapParser(); +$parser->setClient($client); +``` +More details about this middleware are available [here](https://github.com/hamburgscleanest/guzzle-advanced-throttle) + +### Automatic retry + +1. Install middleware: +```bash +composer require caseyamcl/guzzle_retry_middleware +``` + +2. Create stack: +```php +$stack = new HandlerStack(); +$stack->setHandler(new CurlHandler()); +``` + +3. Add middleware to the stack: +```php +$stack->push(GuzzleRetryMiddleware::factory()); +``` + +4. Create client manually: +```php +$client = new \GuzzleHttp\Client(['handler' => $stack]); +``` + +5. Pass client as an argument or use `setClient` method: +```php +$parser = new SitemapParser(); +$parser->setClient($client); +``` +More details about this middleware are available [here](https://github.com/caseyamcl/guzzle_retry_middleware) + +### Advanced logging + +1. Install middleware: +```bash +composer require gmponos/guzzle_logger +``` + +2. Create PSR-3 style logger: +```php +$logger = new Logger(); +``` + +3. Create handler stack: + +```php +$stack = new HandlerStack(); +$stack->setHandler(new CurlHandler()); +``` + +4. Push logger middleware to stack: +```php +$stack->push(new LogMiddleware($logger)); +``` + +5. Create client manually: +```php +$client = new \GuzzleHttp\Client(['handler' => $stack]); +``` +6. Pass client as an argument or use `setClient` method: +```php +$parser = new SitemapParser(); +$parser->setClient($client); +``` +More details about this middleware config (like log levels, when to log and what to log) are available [here](https://github.com/gmponos/guzzle_logger) + + + ### Additional examples Even more examples available in the [examples](https://github.com/VIPnytt/SitemapParser/tree/master/examples) directory. 
diff --git a/composer.json b/composer.json index a654f1d..f4f15b3 100644 --- a/composer.json +++ b/composer.json @@ -43,5 +43,10 @@ "psr-4": { "vipnytt\\SitemapParser\\Tests\\": "tests/" } + }, + "suggest": { + "caseyamcl/guzzle_retry_middleware": "Allow automatic retry when request for sitemap fails", + "hamburgscleanest/guzzle-advanced-throttle": "Throttle requests", + "gmponos/guzzle_logger": "Advanced logging" } } diff --git a/src/SitemapParser.php b/src/SitemapParser.php index 5ca1906..dafaf56 100644 --- a/src/SitemapParser.php +++ b/src/SitemapParser.php @@ -97,6 +97,11 @@ class SitemapParser */ protected $currentURL; + /** + * @var \GuzzleHttp\Client|null HTTP client used by getContent(); null until injected or lazily created by getClient() + */ + protected $client; + /** * Constructor * @@ -104,7 +109,7 @@ class SitemapParser * @param array $config Configuration options * @throws Exceptions\SitemapParserException */ - public function __construct($userAgent = self::DEFAULT_USER_AGENT, array $config = []) + public function __construct($userAgent = self::DEFAULT_USER_AGENT, array $config = [], GuzzleHttp\Client $client = null) { mb_language("uni"); if (!mb_internal_encoding(self::ENCODING)) { @@ -112,6 +117,10 @@ public function __construct($userAgent = self::DEFAULT_USER_AGENT, array $config } $this->userAgent = $userAgent; $this->config = $config; + + if (!is_null($client)) { + $this->setClient($client); + } } /** @@ -237,7 +246,7 @@ protected function getContent() if (!isset($this->config['guzzle']['headers']['User-Agent'])) { $this->config['guzzle']['headers']['User-Agent'] = $this->userAgent; } - $client = new GuzzleHttp\Client(); + $client = $this->getClient(); $res = $client->request('GET', $this->currentURL, $this->config['guzzle']); return $res->getBody()->getContents(); } catch (GuzzleHttp\Exception\TransferException $e) { @@ -506,4 +515,25 @@ public function getUserAgent() { public function setUserAgent(string $userAgent) { $this->userAgent = $userAgent; } + + /** + * @return \GuzzleHttp\Client The client set via setClient(), or a default client created on first call + */ + protected function getClient() + { + 
if (empty($this->client)) { + $this->client = new \GuzzleHttp\Client(); + } + return $this->client; + } + + /** + * @param \GuzzleHttp\Client $client HTTP client to use for requests made by this parser + * @return $this The parser instance, to allow method chaining + */ + public function setClient(\GuzzleHttp\Client $client) + { + $this->client = $client; + return $this; + } }