From c414ec31b7d76f5dee32779e5f5efaa5e2786625 Mon Sep 17 00:00:00 2001 From: Grzegorz Drozd <1885137+GrzegorzDrozd@users.noreply.github.com> Date: Mon, 27 Nov 2023 16:24:01 +0100 Subject: [PATCH] Change client creation to allow override (#24) * Add GuzzleHttp client to SitemapParser constructor as a parameter The SitemapParser constructor now accepts a GuzzleHttp client as a parameter, improving flexibility and testability * The README and composer.json have been updated to suggest middleware for automatic retries on failed requests, and for throttling requests to prevent rate limit issues. Detailed instructions for implementation of these middlewares have been added to the README file. * Add advanced logging in composer.json and README.md The commit includes the addition of the "gmponos/guzzle-log-middleware" library in the composer.json file and the detailed instructions to use it in the README.md. This addition would enhance the application's logging and debugging abilities. * Allow SitemapParser's constructor to accept null client * Set client only when it is provided. 
* Fix package name --------- Co-authored-by: Grzegorz Drozd --- README.md | 123 +++++++++++++++++++++++++++++++++++++++++- composer.json | 5 ++ src/SitemapParser.php | 34 +++++++++++- 3 files changed, 159 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4646425..f9e7ef0 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,9 @@ The [Sitemaps.org](http://www.sitemaps.org/) protocol is the leading standard an - Custom User-Agent string - Proxy support - URL blacklist +- request throttling (using https://github.com/hamburgscleanest/guzzle-advanced-throttle) +- retry (using https://github.com/caseyamcl/guzzle_retry_middleware) +- advanced logging (using https://github.com/gmponos/guzzle_logger) ## Formats supported - XML `.xml` @@ -33,7 +36,9 @@ The [Sitemaps.org](http://www.sitemaps.org/) protocol is the leading standard an - [mbstring](http://php.net/manual/en/book.mbstring.php) - [libxml](http://php.net/manual/en/book.libxml.php) _(enabled by default)_ - [SimpleXML](http://php.net/manual/en/book.simplexml.php) _(enabled by default)_ - +- Optional: + - https://github.com/caseyamcl/guzzle_retry_middleware + - https://github.com/hamburgscleanest/guzzle-advanced-throttle ## Installation The library is available for install via [Composer](https://getcomposer.org). Just add this to your `composer.json` file: ```json @@ -143,6 +148,122 @@ try { } ``` +### Throttling + +1. Install middleware: +```bash +composer require hamburgscleanest/guzzle-advanced-throttle +``` +2. Define host rules: + +```php +$rules = new RequestLimitRuleset([ + 'https://www.google.com' => [ + [ + 'max_requests' => 20, + 'request_interval' => 1 + ], + [ + 'max_requests' => 100, + 'request_interval' => 120 + ] + ] +]); +``` +3. Create handler stack: + +```php +$stack = new HandlerStack(); +$stack->setHandler(new CurlHandler()); +``` +4. 
Create middleware: +```php +$throttle = new ThrottleMiddleware($rules); + + // Invoke the middleware +$stack->push($throttle()); + +// OR: alternatively call the handle method directly +$stack->push($throttle->handle()); +``` +5. Create client manually: +```php +$client = new \GuzzleHttp\Client(['handler' => $stack]); +``` +6. Pass client as an argument or use `setClient` method: +```php +$parser = new SitemapParser(); +$parser->setClient($client); +``` +More details about this middleware are available [here](https://github.com/hamburgscleanest/guzzle-advanced-throttle) + +### Automatic retry + +1. Install middleware: +```bash +composer require caseyamcl/guzzle_retry_middleware +``` + +2. Create stack: +```php +$stack = new HandlerStack(); +$stack->setHandler(new CurlHandler()); +``` + +3. Add middleware to the stack: +```php +$stack->push(GuzzleRetryMiddleware::factory()); +``` + +4. Create client manually: +```php +$client = new \GuzzleHttp\Client(['handler' => $stack]); +``` + +5. Pass client as an argument or use `setClient` method: +```php +$parser = new SitemapParser(); +$parser->setClient($client); +``` +More details about this middleware are available [here](https://github.com/caseyamcl/guzzle_retry_middleware) + +### Advanced logging + +1. Install middleware: +```bash +composer require gmponos/guzzle_logger +``` + +2. Create PSR-3 style logger: +```php +$logger = new Logger(); +``` + +3. Create handler stack: + +```php +$stack = new HandlerStack(); +$stack->setHandler(new CurlHandler()); +``` + +4. Push logger middleware to stack: +```php +$stack->push(new LogMiddleware($logger)); +``` + +5. Create client manually: +```php +$client = new \GuzzleHttp\Client(['handler' => $stack]); +``` +6. 
Pass client as an argument or use `setClient` method: +```php +$parser = new SitemapParser(); +$parser->setClient($client); +``` +More details about this middleware config (like log levels, when to log and what to log) are available [here](https://github.com/gmponos/guzzle_logger) + + + ### Additional examples Even more examples available in the [examples](https://github.com/VIPnytt/SitemapParser/tree/master/examples) directory. diff --git a/composer.json b/composer.json index a654f1d..f4f15b3 100644 --- a/composer.json +++ b/composer.json @@ -43,5 +43,10 @@ "psr-4": { "vipnytt\\SitemapParser\\Tests\\": "tests/" } + }, + "suggest": { + "caseyamcl/guzzle_retry_middleware": "Allow automatic retry when request for sitemap fails", + "hamburgscleanest/guzzle-advanced-throttle": "Throttle requests", + "gmponos/guzzle_logger": "Advanced logging" } } diff --git a/src/SitemapParser.php b/src/SitemapParser.php index 5ca1906..dafaf56 100644 --- a/src/SitemapParser.php +++ b/src/SitemapParser.php @@ -97,6 +97,11 @@ class SitemapParser */ protected $currentURL; + /** + * @var \GuzzleHttp\Client + */ + protected $client; + /** * Constructor * @@ -104,7 +109,7 @@ class SitemapParser * @param array $config Configuration options * @throws Exceptions\SitemapParserException */ - public function __construct($userAgent = self::DEFAULT_USER_AGENT, array $config = []) + public function __construct($userAgent = self::DEFAULT_USER_AGENT, array $config = [], GuzzleHttp\Client $client = null) { mb_language("uni"); if (!mb_internal_encoding(self::ENCODING)) { @@ -112,6 +117,10 @@ public function __construct($userAgent = self::DEFAULT_USER_AGENT, array $config } $this->userAgent = $userAgent; $this->config = $config; + + if (!is_null($client)) { + $this->setClient($client); + } } /** @@ -237,7 +246,7 @@ protected function getContent() if (!isset($this->config['guzzle']['headers']['User-Agent'])) { $this->config['guzzle']['headers']['User-Agent'] = $this->userAgent; } - $client = new 
GuzzleHttp\Client(); + $client = $this->getClient(); $res = $client->request('GET', $this->currentURL, $this->config['guzzle']); return $res->getBody()->getContents(); } catch (GuzzleHttp\Exception\TransferException $e) { @@ -506,4 +515,25 @@ public function getUserAgent() { public function setUserAgent(string $userAgent) { $this->userAgent = $userAgent; } + + /** + * @return \GuzzleHttp\Client The client stored by setClient(), or a default client created and cached on first call + */ + protected function getClient() + { + if (empty($this->client)) { + $this->client = new \GuzzleHttp\Client(); + } + return $this->client; + } + + /** + * @param \GuzzleHttp\Client $client Client that getClient() will return from now on + * @return $this + */ + public function setClient(\GuzzleHttp\Client $client) + { + $this->client = $client; + return $this; + } }