From 555dbce44cf946356bde6c79c487cf8c0f85257f Mon Sep 17 00:00:00 2001 From: Rodrigo Aramburu Date: Sun, 17 Dec 2023 11:42:46 -0300 Subject: [PATCH] consertado workflow adicionado withWebDriver no ScraPHPBuilder passado logs para classe scraphp adicionado builder para scraphp --- .github/workflows/{ci.yml => main.yml} | 14 +- README.md | 2 +- composer.json | 6 + .../docker-compose.yml => docker-compose.yml | 2 +- {tests/docker => docker}/php/Dockerfile | 0 src/HttpClient/AssetFetcher.php | 21 +- src/HttpClient/FilteredElement.php | 4 +- .../Guzzle/GuzzleFilteredElement.php | 7 +- src/HttpClient/Guzzle/GuzzleHttpClient.php | 28 +- src/HttpClient/Guzzle/GuzzlePage.php | 16 +- src/HttpClient/HttpClient.php | 3 - src/HttpClient/Page.php | 11 +- .../WebDriver/WebDriverFilteredElement.php | 18 +- .../WebDriver/WebDriverHttpClient.php | 59 ++-- src/HttpClient/WebDriver/WebDriverPage.php | 23 +- src/ProcessPage.php | 8 +- src/ScraPHP.php | 225 ++++++-------- src/ScraPHPBuilder.php | 162 ++++++++++ src/Writers/CSVWriter.php | 3 +- src/Writers/DatabaseWriter.php | 11 +- src/Writers/JsonWriter.php | 4 +- src/Writers/Writer.php | 30 +- tests/HttpClient/AssetFetcherTest.php | 31 +- .../Guzzle/GuzzleHttpClientTest.php | 17 +- tests/HttpClient/Guzzle/GuzzlePageTest.php | 12 +- .../WebDriver/WebDriverHttpClientTest.php | 55 ++-- .../WebDriver/WebDriverPageTest.php | 29 +- tests/ProcessPageTest.php | 37 ++- tests/ScrapPHPBuildTest.php | 81 +++++ tests/ScrapPHPTest.php | 277 +++++++++--------- tests/Writers/CSVWriterTest.php | 19 +- tests/Writers/DatabaseWriterTest.php | 5 - tests/Writers/JsonWriterTest.php | 5 - 33 files changed, 674 insertions(+), 551 deletions(-) rename .github/workflows/{ci.yml => main.yml} (63%) rename tests/docker-compose.yml => docker-compose.yml (92%) rename {tests/docker => docker}/php/Dockerfile (100%) create mode 100644 src/ScraPHPBuilder.php create mode 100644 tests/ScrapPHPBuildTest.php diff --git a/.github/workflows/ci.yml b/.github/workflows/main.yml 
similarity index 63% rename from .github/workflows/ci.yml rename to .github/workflows/main.yml index 42814c6..7c603a2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/main.yml @@ -20,12 +20,20 @@ jobs: - name: Download dependencies run: composer update --no-interaction --no-progress + + + - uses: isbang/compose-action@v1.5.1 + with: + compose-file: "./docker-compose.yml" + down-flags: "--volumes" + services: | + php + selenium - - name: Start test servers + - name: Sleep to wait for Selenium shell: bash run: | - cd tests/fixtures - php -S 0.0.0.0:8000 & + sleep 30 & - name: Run tests run: ./vendor/bin/pest \ No newline at end of file diff --git a/README.md b/README.md index 5faf575..cb31a48 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ - +![example workflow](https://github.com/rodrigoaramburu/scraphp/actions/workflows/main.yml/badge.svg) # Executar Selenium diff --git a/composer.json b/composer.json index 421500f..2036d73 100644 --- a/composer.json +++ b/composer.json @@ -30,5 +30,11 @@ "allow-plugins": { "pestphp/pest-plugin": true } + }, + "scripts": { + "lint": ["pint"], + "test": ["pest"], + "test:lint": ["pint --test"], + "test:stan": ["phpstan analyse"] } } diff --git a/tests/docker-compose.yml b/docker-compose.yml similarity index 92% rename from tests/docker-compose.yml rename to docker-compose.yml index f925275..685ed2e 100644 --- a/tests/docker-compose.yml +++ b/docker-compose.yml @@ -8,7 +8,7 @@ services: ports: - 8000:8000 volumes: - - './test-pages:/application' + - './tests/test-pages:/application' selenium: image: selenium/standalone-chrome:latest diff --git a/tests/docker/php/Dockerfile b/docker/php/Dockerfile similarity index 100% rename from tests/docker/php/Dockerfile rename to docker/php/Dockerfile diff --git a/src/HttpClient/AssetFetcher.php b/src/HttpClient/AssetFetcher.php index 411c6d4..8ae7269 100644 --- a/src/HttpClient/AssetFetcher.php +++ b/src/HttpClient/AssetFetcher.php @@ -1,25 +1,23 @@ -client = new
\GuzzleHttp\Client(); } - + /** * Fetches an asset from the given URL. * @@ -31,18 +29,15 @@ public function __construct( public function fetchAsset(string $url): string { try { - $this->logger->info('Fetching asset '.$url); $response = $this->client->request('GET', $url); - $this->logger->info('Status: '.$response->getStatusCode().' '.$url); } catch (ClientException $e) { if ($e->getCode() === 404) { - $this->logger->error('404 NOT FOUND '.$url); throw new AssetNotFoundException($url.' not found'); } - } catch(ConnectException $e) { + } catch (ConnectException $e) { throw new HttpClientException($e->getMessage(), $e->getCode(), $e); } return $response->getBody()->getContents(); } -} \ No newline at end of file +} diff --git a/src/HttpClient/FilteredElement.php b/src/HttpClient/FilteredElement.php index 4e9c297..649d961 100644 --- a/src/HttpClient/FilteredElement.php +++ b/src/HttpClient/FilteredElement.php @@ -1,4 +1,5 @@ -crawler->attr($attr); } - public function filterCSS(string $cssSelector): ?FilteredElement{ + public function filterCSS(string $cssSelector): ?FilteredElement + { $crawler = $this->crawler->filter($cssSelector); if ($crawler->count() === 0) { return null; @@ -46,7 +45,7 @@ public function filterCSS(string $cssSelector): ?FilteredElement{ } public function filterCSSEach(string $cssSelector, callable $callback): array - { + { $filter = $this->crawler->filter($cssSelector); return $filter->each(static function (Crawler $crawler, int $i) use ($callback) { diff --git a/src/HttpClient/Guzzle/GuzzleHttpClient.php b/src/HttpClient/Guzzle/GuzzleHttpClient.php index 4120f1b..7f304bd 100644 --- a/src/HttpClient/Guzzle/GuzzleHttpClient.php +++ b/src/HttpClient/Guzzle/GuzzleHttpClient.php @@ -4,33 +4,31 @@ namespace ScraPHP\HttpClient\Guzzle; -use Psr\Log\LoggerInterface; -use ScraPHP\HttpClient\Page; -use ScraPHP\HttpClient\HttpClient; -use ScraPHP\HttpClient\AssetFetcher; use GuzzleHttp\Exception\ClientException; use GuzzleHttp\Exception\ConnectException; 
+use Psr\Log\LoggerInterface; +use ScraPHP\Exceptions\AssetNotFoundException; use ScraPHP\Exceptions\HttpClientException; use ScraPHP\Exceptions\UrlNotFoundException; -use ScraPHP\Exceptions\AssetNotFoundException; +use ScraPHP\HttpClient\AssetFetcher; +use ScraPHP\HttpClient\HttpClient; +use ScraPHP\HttpClient\Page; final class GuzzleHttpClient implements HttpClient { private \GuzzleHttp\Client $client; + private AssetFetcher $assetFetcher; - + /** * Constructor for the class. * - * @param LoggerInterface $logger The logger instance. + * @param LoggerInterface $logger The logger instance. */ - public function __construct( - private LoggerInterface $logger, - - ) + public function __construct() { $this->client = new \GuzzleHttp\Client(); - $this->assetFetcher = new AssetFetcher($this->logger); + $this->assetFetcher = new AssetFetcher(); } /** @@ -45,15 +43,12 @@ public function __construct( public function get(string $url): Page { try { - $this->logger->info('Accessing '.$url); $response = $this->client->request('GET', $url); - $this->logger->info('Status: '.$response->getStatusCode().' '.$url); } catch (ClientException $e) { if ($e->getCode() === 404) { - $this->logger->error('404 NOT FOUND '.$url); throw new UrlNotFoundException($url.' 
not found'); } - } catch(ConnectException $e) { + } catch (ConnectException $e) { throw new HttpClientException($e->getMessage(), $e->getCode(), $e); } @@ -77,5 +72,4 @@ public function fetchAsset(string $url): string { return $this->assetFetcher->fetchAsset($url); } - } diff --git a/src/HttpClient/Guzzle/GuzzlePage.php b/src/HttpClient/Guzzle/GuzzlePage.php index 177c49b..5dddb21 100644 --- a/src/HttpClient/Guzzle/GuzzlePage.php +++ b/src/HttpClient/Guzzle/GuzzlePage.php @@ -1,11 +1,11 @@ -statusCode; } + public function url(): string { return $this->url; } + public function htmlBody(): string { return $this->content; } + public function headers(): array { return $this->headers; } + public function header(string $key): array { return $this->headers[$key] ?? []; @@ -50,14 +53,15 @@ public function filterCSS(string $cssSelector): ?FilteredElement return new GuzzleFilteredElement(crawler: $crawler); } + public function filterCSSEach(string $cssSelector, callable $callback): array { $crawler = new Crawler($this->content); - + $filter = $crawler->filter($cssSelector); return $filter->each(static function (Crawler $crawler, int $i) use ($callback) { return $callback(new GuzzleFilteredElement(crawler: $crawler), $i); }); } -} \ No newline at end of file +} diff --git a/src/HttpClient/HttpClient.php b/src/HttpClient/HttpClient.php index c47438d..15eb6ca 100644 --- a/src/HttpClient/HttpClient.php +++ b/src/HttpClient/HttpClient.php @@ -4,12 +4,9 @@ namespace ScraPHP\HttpClient; -use ScraPHP\HttpClient\Page; - interface HttpClient { public function get(string $url): Page; public function fetchAsset(string $url): string; - } diff --git a/src/HttpClient/Page.php b/src/HttpClient/Page.php index 58a6f50..cb7920d 100644 --- a/src/HttpClient/Page.php +++ b/src/HttpClient/Page.php @@ -1,19 +1,22 @@ -remoteWebElement->findElement( WebDriverBy::cssSelector($cssSelector) ); - }catch (NoSuchElementException $exception){ + } catch (NoSuchElementException $exception) { return null; } + 
return new WebDriverFilteredElement( remoteWebElement: $remoteWebElement ); @@ -48,6 +49,7 @@ public function filterCSSEach(string $cssSelector, callable $callback): array foreach ($elements as $key => $element) { $data[] = $callback(new WebDriverFilteredElement(remoteWebElement: $element), $key); } + return $data; } -} \ No newline at end of file +} diff --git a/src/HttpClient/WebDriver/WebDriverHttpClient.php b/src/HttpClient/WebDriver/WebDriverHttpClient.php index a674a41..4909f33 100644 --- a/src/HttpClient/WebDriver/WebDriverHttpClient.php +++ b/src/HttpClient/WebDriver/WebDriverHttpClient.php @@ -1,39 +1,36 @@ -addArguments(['-headless']); @@ -41,10 +38,9 @@ public function __construct( $desiredCapabilities->setCapability(ChromeOptions::CAPABILITY, $chromeOptions); $this->webDriver = RemoteWebDriver::create($webDriverUrl, $desiredCapabilities); - - $this->assetFetcher = new AssetFetcher($this->logger); - } + $this->assetFetcher = new AssetFetcher(); + } /** * Destructor method for the class. @@ -61,33 +57,30 @@ public function __destruct() /** * Retrieves a web page using the specified URL and returns a Page object. * - * @param string $url The URL of the web page to retrieve. + * @param string $url The URL of the web page to retrieve. + * @return Page The Page object representing the retrieved web page. + * * @throws HttpClient An exception that is thrown when an error occurs while accessing the URL. * @throws UrlNotFoundException An exception that is thrown when the web page is not found (404 error). - * @return Page The Page object representing the retrieved web page. */ public function get(string $url): Page { - $this->logger->info('Accessing ' . 
$url); - try{ + try { $this->webDriver->get($url); - }catch(Exception $e){ + } catch (Exception $e) { throw new HttpClientException($e->getMessage(), $e->getCode(), $e); } - try{ + try { $title = $this->webDriver->findElement(WebDriverBy::cssSelector('h1'))->getText(); - if(str_contains( $title , 'Not Found') ){ - $this->logger->error('404 NOT FOUND ' . $url); + if (str_contains($title, 'Not Found')) { throw new UrlNotFoundException($url); } - }catch(NoSuchElementException $e){ + } catch (NoSuchElementException $e) { // ok não é uma página de erro } - $this->logger->info('Status: ' . 200 . ' ' . $url); - return new WebDriverPage( webDriver: $this->webDriver, statusCode: 200, @@ -105,12 +98,6 @@ public function get(string $url): Page */ public function fetchAsset(string $url): string { - return $this->assetFetcher->fetchAsset($url); + return $this->assetFetcher->fetchAsset($url); } - - public function withLogger(): LoggerInterface - { - return $this->logger; - } - } diff --git a/src/HttpClient/WebDriver/WebDriverPage.php b/src/HttpClient/WebDriver/WebDriverPage.php index 45a5fe6..64e7e6e 100644 --- a/src/HttpClient/WebDriver/WebDriverPage.php +++ b/src/HttpClient/WebDriver/WebDriverPage.php @@ -1,25 +1,24 @@ -statusCode; @@ -50,20 +49,21 @@ public function webDriver(): RemoteWebDriver return $this->webDriver; } - public function filterCSS(string $cssSelector): ?FilteredElement { - try{ + try { $remoteWebElement = $this->webDriver->findElement( WebDriverBy::cssSelector($cssSelector) ); - }catch (NoSuchElementException $exception){ + } catch (NoSuchElementException $exception) { return null; } + return new WebDriverFilteredElement( remoteWebElement: $remoteWebElement ); } + public function filterCSSEach(string $cssSelector, callable $callback): array { $elements = $this->webDriver->findElements(WebDriverBy::cssSelector($cssSelector)); @@ -72,6 +72,7 @@ public function filterCSSEach(string $cssSelector, callable $callback): array foreach ($elements as $key => $element) 
{ $data[] = $callback(new WebDriverFilteredElement(remoteWebElement: $element), $key); } + return $data; } } diff --git a/src/ProcessPage.php b/src/ProcessPage.php index 5c93043..e3b333a 100644 --- a/src/ProcessPage.php +++ b/src/ProcessPage.php @@ -15,20 +15,20 @@ abstract public function process(Page $page): void; /** * Set the ScraPHP instance. * - * @param ScraPHP $scraphp The ScraPHP instance to set. - * @return self + * @param ScraPHP $scraphp The ScraPHP instance to set. */ public function withScraPHP(ScraPHP $scraphp): self { $this->scraphp = $scraphp; + return $this; } /** * Calls a method on the 'scraphp' object dynamically. * - * @param string $name The name of the method to call. - * @param array $arguments The arguments to pass to the method. + * @param string $name The name of the method to call. + * @param array $arguments The arguments to pass to the method. */ public function __call($name, $arguments) { diff --git a/src/ScraPHP.php b/src/ScraPHP.php index e4302a7..2c80e76 100644 --- a/src/ScraPHP.php +++ b/src/ScraPHP.php @@ -5,61 +5,42 @@ namespace ScraPHP; use Closure; -use Monolog\Level; use Monolog\Logger; -use ScraPHP\Writers\Writer; use Psr\Log\LoggerInterface; -use ScraPHP\HttpClient\Page; -use Monolog\Handler\StreamHandler; -use Scraphp\HttpClient\HttpClient; -use Monolog\Formatter\LineFormatter; +use ScraPHP\Exceptions\AssetNotFoundException; use ScraPHP\Exceptions\HttpClientException; use ScraPHP\Exceptions\UrlNotFoundException; -use ScraPHP\Exceptions\AssetNotFoundException; -use ScraPHP\HttpClient\Guzzle\GuzzleHttpClient; +use ScraPHP\HttpClient\HttpClient; +use ScraPHP\HttpClient\Page; +use ScraPHP\Writers\Writer; final class ScraPHP { - private HttpClient $httpClient; - - private LoggerInterface $logger; - - private Writer $writer; - private array $urlErrors = []; - private array $assetErrors = []; - private array $config; + private array $assetErrors = []; /** * Constructs a new instance of the class. 
* - * @param array> $config An array of configuration options. - * - 'logger': (array) An array of configuration options for the logger. - * - 'filename': (string) The filename of the log file. Defaults to 'php://stdout'. - * - 'httpclient': (array) An array of configuration options for the HTTP client. - * - 'retry_count': (int) The number of times to retry a failed request. Defaults to 3. - * - 'retry_time': (int) The number of seconds to wait between retries. Defaults to 30. - * - * @throws Exception If an error occurs during initialization. + * @param HttpClient $httpClient The HTTP client to use. + * @param LoggerInterface $logger The logger to use. + * @param Writer $writer The writer to use. + * @param int $retryCount The number of times to retry. + * @param int $retryTime The time to wait between retries. */ - public function __construct(array $config = []) - { - - $config['logger']['filename'] = $config['logger']['filename'] ?? 'php://stdout'; - - $config['httpclient']['retry_count'] = $config['httpclient']['retry_count'] ?? 3; - $config['httpclient']['retry_time'] = $config['httpclient']['retry_time'] ?? 30; - - $this->config = $config; - - $this->initLogger($config['logger']['filename']); - - $this->httpClient = new GuzzleHttpClient($this->logger); + public function __construct( + private HttpClient $httpClient, + private LoggerInterface $logger, + private Writer $writer, + private int $retryCount = 3, + private int $retryTime = 30 + ) { } /** - * Executes a GET request to the specified URL and invokes the provided callback function with the page object. + * Executes a GET request to the specified URL and invokes the provided callback + * function with the page object. * * @param string $url The URL to send the GET request to. * @param callable|ProcessPage $callback The callback function or class ProcessPage to invoke with the response body. 
@@ -80,91 +61,46 @@ public function go(string $url, Closure|ProcessPage $callback): self $callback->withScraPHP($this); $callback->process($page); } - } catch(HttpClientException $e) { - $this->urlErrors[] = [ 'url' => $url, 'pageProcessor' => $callback]; + } catch (HttpClientException|UrlNotFoundException $e) { + $this->urlErrors[] = ['url' => $url, 'pageProcessor' => $callback]; $this->logger->error('cant get url: '.$url); } - return $this; } - /** * Tries to get a page from the given URL. * - * @param string $url The URL of the page to retrieve. - * @throws HttpClientException If an error occurs while making the HTTP request. + * @param string $url The URL of the page to retrieve. * @return Page The retrieved page. + * + * @throws HttpClientException If an error occurs while making the HTTP request. */ - private function tryGetPage(string $url): Page + private function tryGetPage(string $url): ?Page { $tries = 0; - while($tries < $this->config['httpclient']['retry_count']) { + while ($tries < $this->retryCount) { try { - return $this->httpClient->get($url); - } catch(HttpClientException $e) { - $tries++; + $this->logger->info('Accessing '.$url); + $page = $this->httpClient->get($url); + $this->logger->info('Status: '.$page->statusCode().' '.$url); + + return $page; + } catch (UrlNotFoundException $e) { + $this->logger->error('404 NOT FOUND '.$url); + } catch (HttpClientException $e) { $this->logger->error('Error: '.$e->getMessage()); - if($tries >= $this->config['httpclient']['retry_count']) { - throw $e; - } - $this->logger->info('Retry in ('.($this->config['httpclient']['retry_time'] * $tries).') seconds: '.$url); - sleep($this->config['httpclient']['retry_time'] * $tries); } - + $tries++; + if ($tries >= $this->retryCount) { + throw $e; + } + $this->logger->info('Retry in ('.($this->retryTime * $tries).') seconds: '.$url); + sleep($this->retryTime * $tries); } - } - - /** - * Sets the HTTP client for the object and returns the modified object. 
- * - * @param HttpClientInterface $httpClient The HTTP client to be set. - * @return self The modified object. - */ - public function withHttpClient(HttpClient $httpClient): self - { - $this->httpClient = $httpClient; - $httpClient->withLogger($this->logger); - - return $this; - } - - /** - * Sets the writer for the object and returns the object itself. - * - * @param Writer $writer The writer object to set. - * @return self The updated object with the new writer. - */ - public function withWriter(Writer $writer): self - { - $this->writer = $writer; - $this->writer->withLogger($this->logger); - - return $this; - } - - /** - * Sets a logger for the current object and returns the object itself. - * - * @param LoggerInterface $logger The logger to be set. - * @return self The modified object. - */ - public function withLogger(LoggerInterface $logger): self - { - $this->logger = $logger; - return $this; - } - - /** - * Gets the writer object. - * - * @return Writer The writer object. - */ - public function writer(): Writer - { - return $this->writer; + return null; } /** @@ -179,10 +115,11 @@ public function fetchAsset(string $url): ?string { try { return $this->tryGetAsset($url); - } catch(HttpClientException $e) { - $this->assetErrors[] = [ 'url' => $url]; + } catch (HttpClientException $e) { + $this->assetErrors[] = ['url' => $url]; $this->logger->error('cant get asset: '.$url); } + return null; } @@ -196,7 +133,7 @@ public function fetchAsset(string $url): ?string * * @throws AssetNotFoundException If the asset could not be found. */ - public function saveAsset(string $url, string $path, ?string $filename = null): ?string + public function saveAsset(string $url, string $path, string $filename = null): ?string { try { @@ -206,41 +143,49 @@ public function saveAsset(string $url, string $path, ?string $filename = null): } file_put_contents($path.$filename, $content); - return $path . 
$filename; + return $path.$filename; - } catch(HttpClientException $e) { - $this->assetErrors[] = [ 'url' => $url]; + } catch (HttpClientException $e) { + $this->assetErrors[] = ['url' => $url]; $this->logger->error('cant get asset: '.$url); } return null; } - /** * Tries to get an asset from a given URL. * - * @param string $url The URL of the asset. - * @throws HttpClientException If an error occurs during the HTTP request. + * @param string $url The URL of the asset. * @return string The fetched asset. + * + * @throws HttpClientException If an error occurs during the HTTP request. */ - private function tryGetAsset(string $url): string + private function tryGetAsset(string $url): ?string { $tries = 0; - while($tries < $this->config['httpclient']['retry_count']) { + while ($tries < $this->retryCount) { try { - return $this->httpClient->fetchAsset($url); - } catch(HttpClientException $e) { + $this->logger->info('Fetching asset: '.$url); + $data = $this->httpClient->fetchAsset($url); + $this->logger->info('Fetched: '.$url); + + return $data; + } catch (AssetNotFoundException $e) { + $this->logger->error('404 NOT FOUND '.$url); + } catch (HttpClientException $e) { $tries++; $this->logger->error('Error: '.$e->getMessage()); - if($tries >= $this->config['httpclient']['retry_count']) { + if ($tries >= $this->retryCount) { throw $e; } - $this->logger->info('Retry in ('.($this->config['httpclient']['retry_time'] * $tries).') seconds: '.$url); - sleep($this->config['httpclient']['retry_time'] * $tries); + $this->logger->info('Retry in ('.($this->retryTime * $tries).') seconds: '.$url); + sleep($this->retryTime * $tries); } } + + return null; } /** @@ -264,19 +209,33 @@ public function logger(): Logger } /** - * Initializes the logger. + * Gets the writer object. + * + * @return Writer The writer object. + */ + public function writer(): Writer + { + return $this->writer; + } + + /** + * Gets the current retry count. * - * @param string $logfile The path to the log file. 
+ * @return int The current retry count. + */ + public function retryCount(): int + { + return $this->retryCount; + } + + /** + * Get the retry time. * - * @throws Exception If there is an error initializing the logger. + * @return int The retry time. */ - private function initLogger(string $logfile): void + public function retryTime(): int { - $this->logger = new Logger('SCRAPHP'); - $handler = new StreamHandler($logfile, Level::Debug); - $formatter = new LineFormatter("%datetime% %level_name% %message% %context% %extra%\n", 'Y-m-d H:i:s'); - $handler->setFormatter($formatter); - $this->logger->pushHandler($handler); + return $this->retryTime; } /** @@ -288,6 +247,7 @@ public function urlErrors(): array { return $this->urlErrors; } + /** * Gets the list of asset errors. * @@ -297,4 +257,9 @@ public function assetErrors(): array { return $this->assetErrors; } + + public static function build(): ScraPHPBuilder + { + return new ScraPHPBuilder(); + } } diff --git a/src/ScraPHPBuilder.php b/src/ScraPHPBuilder.php new file mode 100644 index 0000000..ffbb167 --- /dev/null +++ b/src/ScraPHPBuilder.php @@ -0,0 +1,162 @@ +httpClient = $httpClient; + + return $this; + } + + /** + * Sets the Logger and returns itself. If a string was passed in, it + * will be create a Logger to this file. + * + * @param LoggerInterface|string $logger The logger to be set for the object. + * @return self Returns the current object instance. + */ + public function withLogger(LoggerInterface|string $logger): self + { + if (is_string($logger)) { + $this->logger = $this->createDefaultLogger($logger); + + return $this; + } + + $this->logger = $logger; + + return $this; + } + + /** + * Sets the writer for the object. + * + * @param Writer $writer The writer object to be set. + * @return self The modified object with the new writer. + */ + public function withWriter(Writer $writer): self + { + $this->writer = $writer; + + return $this; + } + + /** + * Sets the retry count. 
+ * + * @param int $retryCount The number of times the function should be retried. + */ + public function withRetryCount(int $retryCount): self + { + $this->retryCount = $retryCount; + + return $this; + } + + /** + * Sets the retry time. + * + * @param int $retryTime The retry time in seconds. + * @return self The current instance of the class. + */ + public function withRetryTime(int $retryTime): self + { + $this->retryTime = $retryTime; + + return $this; + } + + /** + * Create a web driver client for the ScraPHP class. + * + * @param string $url The URL of the WebDriver. Default is 'http://localhost:4444'. + * + * @return self The current instance of this class. + */ + public function withWebDriver(string $url = 'http://localhost:4444'): self + { + $this->httpClient = new WebDriverHttpClient($url); + return $this; + } + + /** + * Create a new instance of the ScraPHP class. + */ + public function create(): ScraPHP + { + + $logger = $this->logger === null + ? $this->createDefaultLogger('php://stdout') + : $this->logger; + + $writer = $this->writer === null + ? new JsonWriter('out.json') + : $this->writer; + + $httpClient = $this->httpClient === null + ? new GuzzleHttpClient() + : $this->httpClient; + + return new ScraPHP( + httpClient: $httpClient, + logger: $logger, + writer: $writer, + retryCount: $this->retryCount, + retryTime: $this->retryTime + ); + } + + /** + * Initializes the logger. + * + * @param string $logfile The path to the log file. + * @return LoggerInterface The initialized logger. + * + * @throws Exception If there is an error initializing the logger.
+ */ + private function createDefaultLogger(string $logfile): LoggerInterface + { + $logger = new Logger('SCRAPHP'); + $handler = new StreamHandler($logfile, Level::Debug); + $formatter = new LineFormatter( + "%datetime% %level_name% %message% %context% %extra%\n", + 'Y-m-d H:i:s' + ); + $handler->setFormatter($formatter); + $logger->pushHandler($handler); + + return $logger; + } +} diff --git a/src/Writers/CSVWriter.php b/src/Writers/CSVWriter.php index f18ad4a..7efc500 100644 --- a/src/Writers/CSVWriter.php +++ b/src/Writers/CSVWriter.php @@ -6,7 +6,7 @@ use Exception; -final class CSVWriter extends Writer +final class CSVWriter implements Writer { private mixed $file; @@ -52,7 +52,6 @@ public function write(array $data): void } fwrite($this->file, "\n".implode($this->separator, $orderedData)); - $this->logger()->info('Saved data: '.json_encode($orderedData)); } /** diff --git a/src/Writers/DatabaseWriter.php b/src/Writers/DatabaseWriter.php index 426fe45..b52b218 100644 --- a/src/Writers/DatabaseWriter.php +++ b/src/Writers/DatabaseWriter.php @@ -4,13 +4,13 @@ namespace ScraPHP\Writers; -final class DatabaseWriter extends Writer +final class DatabaseWriter implements Writer { /** * Constructs a new instance of the class. * - * @param \PDO $pdo The PDO object. - * @param string $table The name of the table. + * @param \PDO $pdo The PDO object. + * @param string $table The name of the table. */ public function __construct( private \PDO $pdo, @@ -21,7 +21,7 @@ public function __construct( /** * Writes data to the database. * - * @param array $data The data to write. + * @param array $data The data to write. */ public function write(array $data): void { @@ -37,13 +37,12 @@ public function write(array $data): void } $stmt->execute(); - $this->logger()->info('Saved data: '.json_encode($data)); } /** * Checks if a record exists in the database based on the given search criteria. * - * @param array $search The search criteria to use for the query. 
+ * @param array $search The search criteria to use for the query. * @return bool Returns true if a record exists, false otherwise. */ public function exists(array $search): bool diff --git a/src/Writers/JsonWriter.php b/src/Writers/JsonWriter.php index 523da58..62b139d 100644 --- a/src/Writers/JsonWriter.php +++ b/src/Writers/JsonWriter.php @@ -4,7 +4,7 @@ namespace ScraPHP\Writers; -final class JsonWriter extends Writer +final class JsonWriter implements Writer { private string $filename; @@ -33,8 +33,6 @@ public function write(array $data): void $jsonData = json_decode($json, true); $jsonData[] = $data; file_put_contents($this->filename, json_encode($jsonData, JSON_PRETTY_PRINT)); - - $this->logger()->info('Saved data: '.json_encode($data)); } /** diff --git a/src/Writers/Writer.php b/src/Writers/Writer.php index ad3372b..8f7f30c 100644 --- a/src/Writers/Writer.php +++ b/src/Writers/Writer.php @@ -4,33 +4,9 @@ namespace ScraPHP\Writers; -use Psr\Log\LoggerInterface; - -abstract class Writer +interface Writer { - private LoggerInterface $logger; - - abstract public function write(array $data): void; - - abstract public function exists(array $search): bool; - - /** - * Sets the logger for the class. - * - * @param LoggerInterface $logger The logger to be set. - */ - public function withLogger(LoggerInterface $logger): void - { - $this->logger = $logger; - } + public function write(array $data): void; - /** - * Gets the logger instance. - * - * @return LoggerInterface The logger instance. 
- */ - public function logger(): LoggerInterface - { - return $this->logger; - } + public function exists(array $search): bool; } diff --git a/tests/HttpClient/AssetFetcherTest.php b/tests/HttpClient/AssetFetcherTest.php index 1c118ca..ed2653c 100644 --- a/tests/HttpClient/AssetFetcherTest.php +++ b/tests/HttpClient/AssetFetcherTest.php @@ -1,43 +1,24 @@ -logger = Mockery::mock(LoggerInterface::class); - $this->assetFetcher = new AssetFetcher($this->logger); - +beforeEach(function () { + $this->assetFetcher = new AssetFetcher(); }); test('fetch an asset', function () { - - $this->logger->shouldReceive('info')->with('Fetching asset http://localhost:8000/asset-test.txt'); - $this->logger->shouldReceive('info')->with('Status: 200 http://localhost:8000/asset-test.txt'); - $content = $this->assetFetcher->fetchAsset('http://localhost:8000/asset-test.txt'); - expect($content)->toBe('Asset Test'); }); - test('throw exception if asset not found', function () { - - $this->logger->shouldReceive('info')->with('Fetching asset http://localhost:8000/not-found.txt'); - - $this->logger->shouldReceive('error')->with('404 NOT FOUND http://localhost:8000/not-found.txt'); - $this->assetFetcher->fetchAsset('http://localhost:8000/not-found.txt'); - })->throws(AssetNotFoundException::class); - test('throw exception if http client error on fetchAsset', function () { - $this->logger->shouldReceive('info')->with('Fetching asset http://scraphp.com.br:8321/not-found.jpg'); - $this->assetFetcher->fetchAsset('http://scraphp.com.br:8321/not-found.jpg'); -})->throws(HttpClientException::class); \ No newline at end of file +})->throws(HttpClientException::class); diff --git a/tests/HttpClient/Guzzle/GuzzleHttpClientTest.php b/tests/HttpClient/Guzzle/GuzzleHttpClientTest.php index e658aff..99850aa 100644 --- a/tests/HttpClient/Guzzle/GuzzleHttpClientTest.php +++ b/tests/HttpClient/Guzzle/GuzzleHttpClientTest.php @@ -2,12 +2,11 @@ declare(strict_types=1); -use ScraPHP\HttpClient\Page; use 
Psr\Log\LoggerInterface; use ScraPHP\Exceptions\HttpClientException; use ScraPHP\Exceptions\UrlNotFoundException; -use ScraPHP\Exceptions\AssetNotFoundException; use ScraPHP\HttpClient\Guzzle\GuzzleHttpClient; +use ScraPHP\HttpClient\Page; beforeEach(function () { @@ -18,8 +17,8 @@ test('retrive a webpage and return an object page', function () { - $this->logger->shouldReceive('info')->with('Accessing http://localhost:8000/hello-world.php'); - $this->logger->shouldReceive('info')->with('Status: 200 http://localhost:8000/hello-world.php'); + //$this->logger->shouldReceive('info')->with('Accessing http://localhost:8000/hello-world.php'); + //$this->logger->shouldReceive('info')->with('Status: 200 http://localhost:8000/hello-world.php'); $page = $this->guzzleClient->get('http://localhost:8000/hello-world.php'); @@ -34,26 +33,20 @@ test('fetch an asset', function () { - $this->logger->shouldReceive('info')->with('Fetching asset http://localhost:8000/asset-test.txt'); - $this->logger->shouldReceive('info')->with('Status: 200 http://localhost:8000/asset-test.txt'); - $content = $this->guzzleClient->fetchAsset('http://localhost:8000/asset-test.txt'); expect($content)->toBe('Asset Test'); }); - test('throw exception if url not found', function () { - $this->logger->shouldReceive('info')->with('Accessing http://localhost:8000/not-found.php'); - - $this->logger->shouldReceive('error')->with('404 NOT FOUND http://localhost:8000/not-found.php'); + //$this->logger->shouldReceive('info')->with('Accessing http://localhost:8000/not-found.php'); + //$this->logger->shouldReceive('error')->with('404 NOT FOUND http://localhost:8000/not-found.php'); $this->guzzleClient->get('http://localhost:8000/not-found.php'); })->throws(UrlNotFoundException::class); - test('throw exception if http client error', function () { $this->logger->shouldReceive('info')->with('Accessing http://scraphp.com.br:8321/not-found.php'); diff --git a/tests/HttpClient/Guzzle/GuzzlePageTest.php 
b/tests/HttpClient/Guzzle/GuzzlePageTest.php index f454483..f5953bd 100644 --- a/tests/HttpClient/Guzzle/GuzzlePageTest.php +++ b/tests/HttpClient/Guzzle/GuzzlePageTest.php @@ -1,11 +1,10 @@ -toBeInstanceOf(GuzzlePage::class) ->url()->toBe('http://localhost:8000/hello-world.php') ->statusCode()->toBe(200) - ->htmlBody()->toContain('Página Teste','

Hello World

') + ->htmlBody()->toContain('Página Teste', '

Hello World

') ->headers()->toBeArray() ->headers()->toBe([]); }); - test('filter elements by tag name', function () { $page = new GuzzlePage( @@ -52,7 +50,6 @@ expect($text)->toBe('Lorem ipsum dolor sit amet consectetur.'); }); - test('get attribute from element', function () { $page = new GuzzlePage( url: 'http://localhost:8000/seletors.html', @@ -66,7 +63,6 @@ expect($attr)->toBe('https://www.google.com'); }); - test('iterate filtered elements', function () { $page = new GuzzlePage( url: 'http://localhost:8000/seletors.html', @@ -95,7 +91,6 @@ expect($text)->toBe('Item 1'); }); - test('chain css filterCSS with filterCSSEach ', function () { $page = new GuzzlePage( url: 'http://localhost:8000/seletors.html', @@ -126,7 +121,6 @@ expect($result)->toBe(['Anderson', 'Carlos', 'Rafael']); }); - test('return null when filter element not found', function () { $page = new GuzzlePage( url: 'http://localhost:8000/seletors.html', @@ -151,4 +145,4 @@ $result = $page->filterCSS('.ul .not-found'); expect($result)->toBeNull(); -}); \ No newline at end of file +}); diff --git a/tests/HttpClient/WebDriver/WebDriverHttpClientTest.php b/tests/HttpClient/WebDriver/WebDriverHttpClientTest.php index 86ef015..9dcf6a9 100644 --- a/tests/HttpClient/WebDriver/WebDriverHttpClientTest.php +++ b/tests/HttpClient/WebDriver/WebDriverHttpClientTest.php @@ -1,87 +1,68 @@ -logger = Mockery::mock(LoggerInterface::class); - $this->webDriverClient = new WebDriverHttpClient($this->logger); + $this->webDriverClient = new WebDriverHttpClient(); }); -afterEach(function(){ +afterEach(function () { $this->webDriverClient->__destruct(); }); -test('retrive a webpage and return an object page', function(){ +test('retrive a webpage and return an object page', function () { - $this->logger->shouldReceive('info')->once()->with('Accessing http://localhost:8000/hello-world.php'); - $this->logger->shouldReceive('info')->once()->with('Status: 200 http://localhost:8000/hello-world.php'); + $page = 
$this->webDriverClient->get('http://localhost:8000/hello-world.php'); - $page = $this->webDriverClient->get('http://localhost:8000/hello-world.php'); - expect($page) ->toBeInstanceOf(WebDriverPage::class) ->htmlBody() - ->toContain('Página Teste','

Hello World

') + ->toContain('Página Teste', '

Hello World

') ->statusCode() - ->toBe(200) + ->toBe(200) ->url() - ->toBe('http://localhost:8000/hello-world.php'); + ->toBe('http://localhost:8000/hello-world.php'); }); -test('retrive a webpage and return an object page without h1', function(){ +test('retrive a webpage and return an object page without h1', function () { - $this->logger->shouldReceive('info')->once()->with('Accessing http://localhost:8000/paragraph.html'); - $this->logger->shouldReceive('info')->once()->with('Status: 200 http://localhost:8000/paragraph.html'); + $page = $this->webDriverClient->get('http://localhost:8000/paragraph.html'); - $page = $this->webDriverClient->get('http://localhost:8000/paragraph.html'); - expect($page) ->toBeInstanceOf(WebDriverPage::class) ->htmlBody() - ->toContain('

Lorem ipsum dolor sit amet consectetur.

') + ->toContain('

Lorem ipsum dolor sit amet consectetur.

') ->statusCode() - ->toBe(200) + ->toBe(200) ->url() - ->toBe('http://localhost:8000/paragraph.html'); + ->toBe('http://localhost:8000/paragraph.html'); }); - test('fetch an asset', function () { - $this->logger = Mockery::mock(LoggerInterface::class); - $this->webDriverClient = new WebDriverHttpClient($this->logger); - - $this->logger->shouldReceive('info')->with('Fetching asset http://localhost:8000/asset-test.txt'); - $this->logger->shouldReceive('info')->with('Status: 200 http://localhost:8000/asset-test.txt'); + $this->webDriverClient = new WebDriverHttpClient(); $content = $this->webDriverClient->fetchAsset('http://localhost:8000/asset-test.txt'); expect($content)->toBe('Asset Test'); }); - test('throw exception if url not found', function () { - - $this->logger->shouldReceive('info')->with('Accessing http://localhost:8000/not-found.php'); - - $this->logger->shouldReceive('error')->with('404 NOT FOUND http://localhost:8000/not-found.php'); - $this->webDriverClient->get('http://localhost:8000/not-found.php'); })->throws(UrlNotFoundException::class); - test('throw exception if http client error', function () { - $this->logger->shouldReceive('info')->with('Accessing asdf'); $this->webDriverClient->get('asdf'); }) -->throws(HttpClientException::class); \ No newline at end of file + ->throws(HttpClientException::class); diff --git a/tests/HttpClient/WebDriver/WebDriverPageTest.php b/tests/HttpClient/WebDriver/WebDriverPageTest.php index afebd68..dbf5d52 100644 --- a/tests/HttpClient/WebDriver/WebDriverPageTest.php +++ b/tests/HttpClient/WebDriver/WebDriverPageTest.php @@ -1,15 +1,14 @@ -addArguments(['-headless']); @@ -20,31 +19,30 @@ }); -afterEach(function(){ +afterEach(function () { $this->webDriver->quit(); }); test('have attributes', function () { $this->webDriver->get('http://localhost:8000/hello-world.php'); - + $page = new WebDriverPage( webDriver: $this->webDriver, statusCode: 200, - headers: ['Content-Type'=>['text/html; charset=UTF-8']], + headers: 
['Content-Type' => ['text/html; charset=UTF-8']], ); expect($page)->toBeInstanceOf(WebDriverPage::class) ->url()->toBe('http://localhost:8000/hello-world.php') ->statusCode()->toBe(200) - ->htmlBody()->toContain('Página Teste','

Hello World

') + ->htmlBody()->toContain('Página Teste', '

Hello World

') ->headers()->toBeArray() - ->headers()->toBe(['Content-Type'=>['text/html; charset=UTF-8']]) + ->headers()->toBe(['Content-Type' => ['text/html; charset=UTF-8']]) ->header('Content-Type')->toBe(['text/html; charset=UTF-8']) ->webDriver()->toBe($this->webDriver); }); - test('filter elements by tag name', function () { $this->webDriver->get('http://localhost:8000/seletors.html'); @@ -60,7 +58,6 @@ expect($text)->toBe('Teste Seletores Titulo'); }); - test('filter elements by class', function () { $this->webDriver->get('http://localhost:8000/seletors.html'); @@ -75,7 +72,6 @@ expect($text)->toBe('Lorem ipsum dolor sit amet consectetur.'); }); - test('get attribute from element', function () { $this->webDriver->get('http://localhost:8000/seletors.html'); $page = new WebDriverPage( @@ -104,7 +100,6 @@ expect($result)->toBe(['Item 1 - 0', 'Item 2 - 1', 'Item 3 - 2']); }); - test('chain filterCSS', function () { $this->webDriver->get('http://localhost:8000/seletors.html'); $page = new WebDriverPage( @@ -161,7 +156,6 @@ expect($result)->toBe('Lorem ipsum dolor sit amet consectetur.'); }); - test('return null if element not found', function () { $this->webDriver->get('http://localhost:8000/paragraph.html'); @@ -190,7 +184,6 @@ expect($result)->toBeNull(); }); - test('return empty array iterating on not found filtered elements', function () { $this->webDriver->get('http://localhost:8000/seletors.html'); $page = new WebDriverPage( @@ -202,6 +195,6 @@ $result = $page->filterCSSEach('ul .not-found', function (FilteredElement $element, int $i) { return $element->text().' 
- '.$i; }); - + expect($result)->toBe([]); -}); \ No newline at end of file +}); diff --git a/tests/ProcessPageTest.php b/tests/ProcessPageTest.php index b5737f3..f7e384e 100644 --- a/tests/ProcessPageTest.php +++ b/tests/ProcessPageTest.php @@ -2,28 +2,47 @@ declare(strict_types=1); +use Psr\Log\LoggerInterface; +use ScraPHP\HttpClient\HttpClient; use ScraPHP\HttpClient\Page; -use ScraPHP\ScraPHP; use ScraPHP\ProcessPage; -use ScraPHP\HttpClient\HttpClient; +use ScraPHP\ScraPHP; +use ScraPHP\Writers\Writer; test('bind scraphp methods to instance', function () { - $pp = new class () extends ProcessPage { + $pp = new class() extends ProcessPage + { public function process(Page $page): void { } }; - $scraphp = new ScraPHP(); $httpClient = Mockery::mock(HttpClient::class); - - $httpClient->shouldReceive('withLogger')->once(); + $logger = Mockery::mock(LoggerInterface::class); + $scraphp = new ScraPHP( + httpClient: $httpClient, + logger: $logger, + writer: Mockery::mock(Writer::class), + ); + + $page = Mockery::mock(Page::class); + $page + ->shouldReceive('statusCode') + ->andReturn(200); + $httpClient->shouldReceive('get') ->with('http://localhost:8000/hello-world.php') ->once() - ->andReturn(Mockery::mock(Page::class)); - $scraphp->withHttpClient($httpClient); + ->andReturn($page); + + $logger->shouldReceive('info') + ->once() + ->with('Accessing http://localhost:8000/hello-world.php'); + + $logger->shouldReceive('info') + ->once() + ->with('Status: 200 http://localhost:8000/hello-world.php'); $pp->withScraPHP($scraphp); @@ -31,6 +50,4 @@ public function process(Page $page): void }); - - }); diff --git a/tests/ScrapPHPBuildTest.php b/tests/ScrapPHPBuildTest.php new file mode 100644 index 0000000..101fbba --- /dev/null +++ b/tests/ScrapPHPBuildTest.php @@ -0,0 +1,81 @@ +create(); + + expect($scraphp)->toBeInstanceOf(ScraPHP::class); + expect($scraphp->httpClient())->toBeInstanceOf(GuzzleHttpClient::class); + expect($scraphp->logger())->toBeInstanceOf(Logger::class); 
+ expect($scraphp->writer())->toBeInstanceOf(JsonWriter::class); + expect($scraphp->retryCount())->toBe(3); + expect($scraphp->retryTime())->toBe(30); + +}); + +test('create a scraphp instance passing attributes', function () { + + $httpClient = Mockery::mock(HttpClient::class); + $logger = Mockery::mock(Logger::class); + $writer = Mockery::mock(Writer::class); + + $scraphp = ScraPHP::build() + ->withHttpClient($httpClient) + ->withLogger($logger) + ->withWriter($writer) + ->create(); + + expect($scraphp)->toBeInstanceOf(ScraPHP::class); + expect($scraphp->httpClient())->toBe($httpClient); + expect($scraphp->logger())->toBe($logger); + expect($scraphp->writer())->toBe($writer); +}); + +test('create a scraphp instance passing a filename for the logger', function () { + + $scraphp = ScraPHP::build() + ->withLogger('test.log') + ->create(); + + $filename = $scraphp->logger()->getHandlers()[0]->getUrl(); + + expect($filename)->toEndWith('test.log'); + +}); + +test('pass retryTime and retryCount', function () { + $scraphp = ScraPHP::build() + ->withRetryTime(15) + ->withRetryCount(5) + ->create(); + + expect($scraphp->retryTime())->toBe(15); + expect($scraphp->retryCount())->toBe(5); +}); + + +test('create a scraphp instance with webdriver', function () { + + $scraphp = ScraPHP::build() + ->withWebDriver('http://localhost:4444') + ->create(); + + expect($scraphp)->toBeInstanceOf(ScraPHP::class); + expect($scraphp->httpClient())->toBeInstanceOf(WebDriverHttpClient::class); +}); \ No newline at end of file diff --git a/tests/ScrapPHPTest.php b/tests/ScrapPHPTest.php index d30c344..a1b96aa 100644 --- a/tests/ScrapPHPTest.php +++ b/tests/ScrapPHPTest.php @@ -2,22 +2,25 @@ declare(strict_types=1); -use ScraPHP\HttpClient\Page; -use ScraPHP\ScraPHP; -use ScraPHP\ProcessPage; use Psr\Log\LoggerInterface; -use ScraPHP\Writers\JsonWriter; -use Scraphp\HttpClient\HttpClient; -use ScraPHP\HttpClient\Guzzle\GuzzlePage; use ScraPHP\Exceptions\HttpClientException; -use 
ScraPHP\HttpClient\Guzzle\GuzzleHttpClient; +use ScraPHP\HttpClient\Guzzle\GuzzlePage; +use Scraphp\HttpClient\HttpClient; +use ScraPHP\HttpClient\Page; +use ScraPHP\ProcessPage; +use ScraPHP\ScraPHP; +use ScraPHP\Writers\Writer; beforeEach(function () { $this->httpClient = Mockery::mock(HttpClient::class); - $this->httpClient->shouldReceive('withLogger')->andReturn($this->httpClient); - - $this->scraphp = new ScraPHP(); - $this->scraphp->withHttpClient($this->httpClient); + $this->logger = Mockery::mock(LoggerInterface::class); + $this->writer = Mockery::mock(Writer::class); + + $this->scraphp = new ScraPHP( + httpClient: $this->httpClient, + logger: $this->logger, + writer: $this->writer, + ); }); afterEach(function () { @@ -46,10 +49,18 @@ url: 'https://localhost:8000/teste.html' )); + $this->logger + ->shouldReceive('info') + ->with('Accessing https://localhost:8000/teste.html'); + + $this->logger + ->shouldReceive('info') + ->with('Status: 200 https://localhost:8000/teste.html'); + $this->scraphp->go('https://localhost:8000/teste.html', function (Page $page) { expect($page)->toBeInstanceOf(Page::class) ->htmlBody()->toBe('

Hello World

') - ->statusCode()->toBe(200) + ->statusCode()->toBe(200) ->headers()->toBe([]) ->url()->toBe('https://localhost:8000/teste.html'); @@ -69,24 +80,38 @@ url: 'https://localhost:8000/teste.html' )); + $this->logger + ->shouldReceive('info') + ->with('Accessing https://localhost:8000/teste.html'); + + $this->logger + ->shouldReceive('info') + ->with('Status: 200 https://localhost:8000/teste.html'); + $this->scraphp->go('https://localhost:8000/teste.html', function (Page $page) { expect($this)->toBeInstanceOf(ScraPHP::class); }); }); -test('default http client should be GuzzleHttpClient', function () { - $scraphp = new ScraPHP(); - expect($scraphp->httpClient())->toBeInstanceOf(GuzzleHttpClient::class); -}); - test('call fetch an asset from httpClient', function () { - $this->httpClient->shouldReceive('fetchAsset') + $this->httpClient + ->shouldReceive('fetchAsset') ->once() ->with('https://localhost:8000/texto.txt') ->andReturn('Hello World'); + $this->logger + ->shouldReceive('info') + ->once() + ->with('Fetching asset: https://localhost:8000/texto.txt'); + + $this->logger + ->shouldReceive('info') + ->once() + ->with('Fetched: https://localhost:8000/texto.txt'); + $content = $this->scraphp->fetchAsset('https://localhost:8000/texto.txt'); expect($content)->toBe('Hello World'); @@ -95,11 +120,22 @@ test('call save asset with default filename', function () { - $this->httpClient->shouldReceive('fetchAsset') + $this->httpClient + ->shouldReceive('fetchAsset') ->once() ->with('https://localhost:8000/texto.txt') ->andReturn('Hello World'); + $this->logger + ->shouldReceive('info') + ->once() + ->with('Fetching asset: https://localhost:8000/texto.txt'); + + $this->logger + ->shouldReceive('info') + ->once() + ->with('Fetched: https://localhost:8000/texto.txt'); + $file = $this->scraphp->saveAsset('https://localhost:8000/texto.txt', __DIR__.'/assets/'); expect($file)->toBeFile(); @@ -108,75 +144,66 @@ test('call save asset with custom filename', function () { - 
$this->httpClient->shouldReceive('fetchAsset') + $this->httpClient + ->shouldReceive('fetchAsset') ->once() ->with('https://localhost:8000/texto.txt') ->andReturn('Hello World'); + $this->logger + ->shouldReceive('info') + ->once() + ->with('Fetching asset: https://localhost:8000/texto.txt'); + + $this->logger + ->shouldReceive('info') + ->once() + ->with('Fetched: https://localhost:8000/texto.txt'); + $file = $this->scraphp->saveAsset('https://localhost:8000/texto.txt', __DIR__.'/assets/', 'my-filename.txt'); expect($file)->toBeFile(); expect(file_get_contents($file))->toBe('Hello World'); }); -test('log to a file', function () { - $scraphp = new ScraPHP([ - 'logger' => ['filename' => __DIR__.'/assets/log.txt'], - ]); - - $scraphp->logger()->debug('Teste'); - - expect(__DIR__.'/assets/log.txt')->toBeFile(); - expect(file_get_contents(__DIR__.'/assets/log.txt'))->toContain('Teste'); - -}); - -test('inject the logger into the writer', function () { - - $scraphp = new ScraPHP(); - - $scraphp->withWriter(new JsonWriter(__DIR__.'/assets/log.txt')); - - expect($scraphp->writer()->logger())->toBeInstanceOf(LoggerInterface::class); -}); +test('call class ProcessPage', function () { + $this->logger + ->shouldReceive('info') + ->with('Accessing https://localhost:8000/teste.html'); -test('call class ProcessPage', function () { + $this->logger + ->shouldReceive('info') + ->with('Status: 200 https://localhost:8000/teste.html'); - $httpClient = Mockery::mock(HttpClient::class); - $httpClient->shouldReceive('get') + $this->httpClient->shouldReceive('get') ->andReturn(new GuzzlePage( content: '

Hello World

', statusCode: 200, headers: [], url: 'https://localhost:8000/teste.html' )); - $httpClient->shouldReceive('withLogger')->once(); - $scraphp = new ScraPHP(); - $scraphp->withHttpClient($httpClient); - - $pp = Mockery::mock(ProcessPage::class); - $pp->shouldReceive('withScraPHP')->once()->with($scraphp); + $pp = Mockery::mock(ProcessPage::class); + $pp->shouldReceive('withScraPHP')->once()->with($this->scraphp); $pp->shouldReceive('process')->once(); - $scraphp->go('https://localhost:8000/teste.html', $pp); + $this->scraphp->go('https://localhost:8000/teste.html', $pp); }); - - test('retry get a url after a failed', function () { - $httpClient = Mockery::mock(HttpClient::class); - $httpClient + + $this->httpClient ->shouldReceive('get') ->times(3) - ->andReturnUsing(function () use ($httpClient) { + ->andReturnUsing(function () { static $counter = 0; - if($counter < 2) { + if ($counter < 2) { $counter++; throw new HttpClientException('test'); } + return new GuzzlePage( content: '

Hello World

', statusCode: 200, @@ -185,84 +212,75 @@ ); }); - $httpClient->shouldReceive('withLogger')->once(); - $scraphp = new ScraPHP(config:[ - 'httpclient' => [ - 'retry_count' => 3, - 'retry_time' => 1 - ] - ]); + $scraphp = new ScraPHP( + httpClient: $this->httpClient, + logger: $this->logger, + writer: $this->writer, + retryCount: 3, + retryTime: 1 + ); - $loggerMock = Mockery::mock(LoggerInterface::class); - $loggerMock->shouldReceive('error'); - $loggerMock->shouldReceive('info'); - $scraphp->withHttpClient($httpClient)->withLogger($loggerMock); + $this->logger->shouldReceive('error'); + $this->logger->shouldReceive('info'); $scraphp->go('http://localhost:8000/teste.html', function (Page $page) { }); - expect($scraphp->urlErrors())->toHaveCount(0); }); +test('save a failed url and its processor after tried 3 times', function () { + + $this->httpClient + ->shouldReceive('get') + ->times(3) + ->andThrows(new HttpClientException('test')); + $scraphp = new ScraPHP( + httpClient: $this->httpClient, + logger: $this->logger, + writer: $this->writer, + retryCount: 3, + retryTime: 1 + ); -test('save a failed url and its processor after tried 3 times', function () { - $httpClient = Mockery::mock(HttpClient::class); - $httpClient->shouldReceive('get')->times(3)->andThrows(new HttpClientException('test')); - - $httpClient->shouldReceive('withLogger')->once(); - $scraphp = new ScraPHP(config:[ - 'httpclient' => [ - 'retry_count' => 3, - 'retry_time' => 1 - ] - ]); - - $loggerMock = Mockery::mock(LoggerInterface::class); - $loggerMock->shouldReceive('error'); - $loggerMock->shouldReceive('info'); - $scraphp->withHttpClient($httpClient)->withLogger($loggerMock); + $this->logger->shouldReceive('error'); + $this->logger->shouldReceive('info'); $scraphp->go('http://localhost:8000/teste.html', function (Page $page) { }); - expect($scraphp->urlErrors()[0]['url'])->toContain('http://localhost:8000/teste.html'); 
expect($scraphp->urlErrors()[0]['pageProcessor'])->toBeInstanceOf(Closure::class); }); - test('retry get an asset if its fail', function () { - $httpClient = Mockery::mock(HttpClient::class); - $httpClient + $this->httpClient ->shouldReceive('fetchAsset') ->times(3) - ->andReturnUsing(function () use ($httpClient) { + ->andReturnUsing(function () { static $counter = 0; - if($counter < 2) { + if ($counter < 2) { $counter++; throw new HttpClientException('test'); } + return 'ABC'; }); - $httpClient->shouldReceive('withLogger')->once(); - $scraphp = new ScraPHP(config:[ - 'httpclient' => [ - 'retry_count' => 3, - 'retry_time' => 1 - ] - ]); - - $loggerMock = Mockery::mock(LoggerInterface::class); - $loggerMock->shouldReceive('error'); - $loggerMock->shouldReceive('info'); - $scraphp->withHttpClient($httpClient)->withLogger($loggerMock); + $scraphp = new ScraPHP( + httpClient: $this->httpClient, + logger: $this->logger, + writer: $this->writer, + retryCount: 3, + retryTime: 1 + ); + $this->logger->shouldReceive('error'); + $this->logger->shouldReceive('info'); $scraphp->fetchAsset('https://localhost:8000/teste.jpg'); @@ -270,57 +288,50 @@ }); - - test('save a failed url asset tried 3 times', function () { - $httpClient = Mockery::mock(HttpClient::class); - $httpClient->shouldReceive('fetchAsset') + + $this->httpClient + ->shouldReceive('fetchAsset') ->times(3) ->andThrows(new HttpClientException('test')); - $httpClient->shouldReceive('withLogger')->once(); - $scraphp = new ScraPHP(config:[ - 'httpclient' => [ - 'retry_count' => 3, - 'retry_time' => 1 - ] - ]); + $scraphp = new ScraPHP( + httpClient: $this->httpClient, + logger: $this->logger, + writer: $this->writer, + retryCount: 3, + retryTime: 1 + ); - $loggerMock = Mockery::mock(LoggerInterface::class); - $loggerMock->shouldReceive('error'); - $loggerMock->shouldReceive('info'); - $scraphp->withHttpClient($httpClient)->withLogger($loggerMock); + $this->logger->shouldReceive('error'); + 
$this->logger->shouldReceive('info'); $scraphp->fetchAsset('http://localhost:8000/teste.jpg'); - expect($scraphp->assetErrors()[0]['url'])->toContain('http://localhost:8000/teste.jpg'); }); - test('save a failed url asset tried 3 times on saveAsset', function () { - $httpClient = Mockery::mock(HttpClient::class); - $httpClient->shouldReceive('fetchAsset') + + $this->httpClient + ->shouldReceive('fetchAsset') ->times(3) ->andThrows(new HttpClientException('test')); - $httpClient->shouldReceive('withLogger')->once(); - $scraphp = new ScraPHP(config:[ - 'httpclient' => [ - 'retry_count' => 3, - 'retry_time' => 1 - ] - ]); + $scraphp = new ScraPHP( + httpClient: $this->httpClient, + logger: $this->logger, + writer: $this->writer, + retryCount: 3, + retryTime: 1 + ); - $loggerMock = Mockery::mock(LoggerInterface::class); - $loggerMock->shouldReceive('error'); - $loggerMock->shouldReceive('info'); - $scraphp->withHttpClient($httpClient)->withLogger($loggerMock); + $this->logger->shouldReceive('error'); + $this->logger->shouldReceive('info'); $scraphp->saveAsset('http://localhost:8000/teste.jpg', 'teste.jpg'); - expect($scraphp->assetErrors()[0]['url'])->toContain('http://localhost:8000/teste.jpg'); }); diff --git a/tests/Writers/CSVWriterTest.php b/tests/Writers/CSVWriterTest.php index a6beed2..f4dc635 100644 --- a/tests/Writers/CSVWriterTest.php +++ b/tests/Writers/CSVWriterTest.php @@ -8,7 +8,7 @@ beforeEach(function () { $this->logger = Mockery::mock(LoggerInterface::class); $this->writer = new CSVWriter(__DIR__.'/../assets/test.csv', ['name', 'lastname', 'age']); - $this->writer->withLogger($this->logger); + }); afterEach(function () { @@ -18,7 +18,6 @@ }); test('write csv line', function () { - $this->logger->shouldReceive('info')->once(); $this->writer->write([ 'name' => 'Rodrigo', @@ -33,7 +32,6 @@ }); test('write csv line with unsorted keys ', function () { - $this->logger->shouldReceive('info')->once(); $this->writer->write([ 'lastname' => 'Aramburu', 'age' => 
25, @@ -48,8 +46,6 @@ test('check if a value exists in the csv file', function () { - $this->logger->shouldReceive('info')->times(3); - $this->writer->write([ 'name' => 'Rodrigo', 'lastname' => 'Aramburu', @@ -74,8 +70,6 @@ test('check if a value exists in the csv file with two criteria', function () { - $this->logger->shouldReceive('info')->times(3); - $this->writer->write([ 'name' => 'Rodrigo', 'lastname' => 'Aramburu', @@ -102,9 +96,7 @@ $this->logger = Mockery::mock(LoggerInterface::class); $this->writer = new CSVWriter(__DIR__.'/../assets/test.csv', ['name', 'lastname', 'age']); - $this->writer->withLogger($this->logger); - $this->logger->shouldReceive('info')->once(); $this->writer->write([ 'name' => 'Antonio', 'lastname' => 'Silva', @@ -125,13 +117,10 @@ })->throws(Exception::class, 'File '.__DIR__.'/../assets/test.csv'.' already exists with different header'); - test('write a recorde without a header', function () { unlink(__DIR__.'/../assets/test.csv'); - $this->logger->shouldReceive('info')->once(); - $writer = new CSVWriter(__DIR__.'/../assets/test.csv', ); - $writer->withLogger($this->logger); + $writer = new CSVWriter(__DIR__.'/../assets/test.csv'); $writer->write([ 'name' => 'Rodrigo', @@ -143,15 +132,13 @@ }); - test('ignore blank lines in exists method', function () { - $fp = fopen(__DIR__.'/../assets/test.csv', 'a'); fwrite($fp, "\n"); fwrite($fp, "Antonio,Silva,53\n"); fwrite($fp, "\n"); - fwrite($fp, "Gisele,Antunes,15"); + fwrite($fp, 'Gisele,Antunes,15'); expect($this->writer->exists(['name' => 'Antonio']))->toBeTrue(); expect($this->writer->exists(['name' => 'Gisele']))->toBeTrue(); diff --git a/tests/Writers/DatabaseWriterTest.php b/tests/Writers/DatabaseWriterTest.php index 3291a4c..e0b351d 100644 --- a/tests/Writers/DatabaseWriterTest.php +++ b/tests/Writers/DatabaseWriterTest.php @@ -16,19 +16,16 @@ ) SQL); - $this->logger = Mockery::mock(LoggerInterface::class); $this->writer = new DatabaseWriter( $this->pdo, 'users' ); - 
$this->writer->withLogger($this->logger); }); test('write a record to database', function () { - $this->logger->shouldReceive('info')->once(); $this->writer->write([ 'name' => 'Rodrigo', 'lastname' => 'Aramburu', @@ -49,7 +46,6 @@ test('check if a record exists in database', function () { - $this->logger->shouldReceive('info')->times(3); $this->writer->write([ 'name' => 'Rodrigo', 'lastname' => 'Aramburu', @@ -75,7 +71,6 @@ test('check if a record exists in database with two criteria', function () { - $this->logger->shouldReceive('info')->times(3); $this->writer->write([ 'name' => 'Rodrigo', 'lastname' => 'Aramburu', diff --git a/tests/Writers/JsonWriterTest.php b/tests/Writers/JsonWriterTest.php index 7e3b4e4..99daf0e 100644 --- a/tests/Writers/JsonWriterTest.php +++ b/tests/Writers/JsonWriterTest.php @@ -9,7 +9,6 @@ /** @var LoggerInterface */ $this->logger = Mockery::mock(LoggerInterface::class); $this->jsonWriter = new JsonWriter(__DIR__.'/../assets/test.json'); - $this->jsonWriter->withLogger($this->logger); }); afterEach(function () { @@ -20,7 +19,6 @@ test('write a json file', function () { - $this->logger->shouldReceive('info')->once(); $this->jsonWriter->write([ 'name' => 'Rodrigo', 'lastname' => 'Aramburu', @@ -39,7 +37,6 @@ test('check if a value exists in the jsons file', function () { - $this->logger->shouldReceive('info')->times(3); $this->jsonWriter->write([ 'name' => 'Rodrigo', 'lastname' => 'Aramburu', @@ -64,7 +61,6 @@ test('check if a value exists in the json file with two criteria', function () { - $this->logger->shouldReceive('info')->times(3); $this->jsonWriter->write([ 'name' => 'Rodrigo', 'lastname' => 'Aramburu', @@ -95,7 +91,6 @@ ], ])); - $this->logger->shouldReceive('info')->once(); $this->jsonWriter->write([ 'name' => 'Antonio', 'lastname' => 'Silva',