Skip to content

Commit 54965fc

Browse files
author
Mindaugas Kasparavicius
committed
add new websites, update code with new crawling logic
1 parent 1cf2753 commit 54965fc

30 files changed

+500
-363
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,6 @@ composer.lock
44
storage/logs
55
storage/cache
66
.idea
7-
vendor
7+
vendor
8+
node_modules
9+
yarn.lock

README.md

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -10,21 +10,12 @@ Note: tested on Ubuntu 20.04 only.
1010
2. Run `sudo apt install sox`
1111
3. Run `sudo apt install libsox-fmt-mp3`
1212
4. Run `sudo apt-get install sendmail`
13-
5. Install required dependencies by running command `composer install`
14-
6. Copy configuration file by running command `cp .env.example .env`
15-
7. Set your desired configuration presets for .env file
16-
8. Start web scan by running `php ./app app:scan:web `
17-
18-
## Architecture
19-
20-
1. app.php (script entry, bootstraps all application code and starts ScanWebCommand)
21-
2. ScanWebCommand (initiates scan)
22-
1. Manager (iterates over website list and initiates crawling)
23-
1. Websites[] (goes over website and fetches stock data)
24-
2. Mailer
25-
3. Logger
26-
4. Notification sound
27-
13+
5. Run `yarn install`
14+
6. Install required dependencies by running command `composer install`
15+
7. Copy configuration file by running command `cp .env.example .env`
16+
8. Set your desired configuration values in the `.env` file
17+
9. Start the web scan by running `php ./app app:scan:web`
18+
2819
## Dependencies
2920

3021
1. PHP 7.2
@@ -42,6 +33,8 @@ Note: tested on Ubuntu 20.04 only.
4233
1. skytech.lt
4334
2. Varle.lt
4435
3. kilobaitas.lt
36+
4. Topocentras.lt
37+
5. Kaina24.lt
4538

4639
## How to add more stores support
4740

app.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
use Symfony\Component\Console\Application;
77
use Symfony\Component\Dotenv\Dotenv;
88
use App\Container\ContainerBindings;
9-
use App\Console\ScanWebCommand;
9+
use App\Console\CrawlCommand;
1010

1111
$dotenv = new Dotenv();
1212
$dotenv->load(__DIR__.'/.env');
@@ -15,5 +15,5 @@
1515
$bindings->bind();
1616

1717
$application = new Application();
18-
$application->add($bindings->getContainer()->get(ScanWebCommand::class));
18+
$application->add($bindings->getContainer()->get(CrawlCommand::class));
1919
$application->run();

app/Console/ScanWebCommand.php renamed to app/Console/CrawlCommand.php

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,18 @@
55

66
use App\Services\Logger\Logger;
77
use App\Services\Mail\Mailer;
8-
use App\Services\ScanManager\Manager;
9-
use App\Services\ScanManager\ScanResultInterface;
8+
use App\Services\CrawlerManager\CrawlerManager;
109
use App\Services\Websites\Data\Product;
1110
use Carbon\Carbon;
11+
use Psr\Http\Message\UriInterface;
1212
use Symfony\Component\Console\Command\Command;
1313
use Symfony\Component\Console\Input\InputInterface;
1414
use Symfony\Component\Console\Output\OutputInterface;
1515
use Throwable;
1616
use Monolog\Logger as MonoLog;
1717
use Twig\Environment;
1818

19-
class ScanWebCommand extends Command
19+
class CrawlCommand extends Command
2020
{
2121
protected static $defaultName = 'app:scan:web';
2222

@@ -26,9 +26,9 @@ class ScanWebCommand extends Command
2626
private $twig;
2727
private $recipient;
2828

29-
private $scanIterations = 0;
29+
private $iterations = 0;
3030

31-
public function __construct(Manager $manager, Logger $logger, Mailer $mailer, Environment $twig, string $emailRecipient)
31+
public function __construct(CrawlerManager $manager, Logger $logger, Mailer $mailer, Environment $twig, string $emailRecipient)
3232
{
3333
parent::__construct();
3434
$this->manager = $manager;
@@ -38,24 +38,24 @@ public function __construct(Manager $manager, Logger $logger, Mailer $mailer, En
3838
$this->recipient = $emailRecipient;
3939
}
4040

41-
4241
protected function execute(InputInterface $input, OutputInterface $output): int
4342
{
4443
try {
45-
$output->writeln(sprintf('%s: Scanning. Iteration #%s.', Carbon::now()->toDateTimeString(), ++$this->scanIterations));
44+
$output->writeln(sprintf('%s: Crawling. Iteration #%s.', Carbon::now()->toDateTimeString(), ++$this->iterations));
45+
46+
$result = [];
47+
$this->manager->crawl(
48+
function (UriInterface $url, array $products) use (&$result, $output) {
49+
$output->writeln(sprintf('%s: Parsed website %s.', Carbon::now()->toDateTimeString(), $url->getHost()));
50+
$result = array_merge($result, $products);
51+
}
52+
);
4653

47-
$result = $this->manager->scan();
48-
if ($result->getProductCount() !== 0) {
49-
$output->writeln(sprintf('%s: %s Products found.', Carbon::now()->toDateTimeString(), $result->getProductCount()));
50-
$this->parseResult($result);
51-
$this->playSound();
52-
} else {
53-
$output->writeln(sprintf('%s: No updates found yet.', Carbon::now()->toDateTimeString()));
54-
}
54+
!empty($result)
55+
? $this->notifyAboutProducts($result)
56+
: $output->writeln(sprintf('%s: No results found.', Carbon::now()->toDateTimeString()));
5557

56-
$rescanTime = $this->manager->getRescanTimeSeconds();
57-
$output->writeln(sprintf('%s: Sleeping for %s seconds.', Carbon::now()->toDateTimeString(), $rescanTime));
58-
sleep($rescanTime);
58+
sleep(10);
5959

6060
return $this->execute($input, $output);
6161
} catch (Throwable $e) {
@@ -66,9 +66,10 @@ protected function execute(InputInterface $input, OutputInterface $output): int
6666
}
6767
}
6868

69-
private function parseResult(ScanResultInterface $result): void
69+
private function notifyAboutProducts(array $products): void
7070
{
71-
$products = $result->getProducts();
71+
//$this->playSound();
72+
7273
$html = $this->twig->render('index.html', ['products' => $products]);
7374

7475
$this->mailer->send(

app/Container/ContainerBindings.php

Lines changed: 33 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,21 @@
33

44
namespace App\Container;
55

6-
use App\Console\ScanWebCommand;
6+
use App\Console\CrawlCommand;
77
use App\Services\Cache\ProductCache;
8+
use App\Services\Crawler\CrawlerObserver;
89
use App\Services\Logger\Logger;
910
use App\Services\Mail\Mailer;
10-
use App\Services\ScanManager\Manager;
11+
use App\Services\CrawlerManager\CrawlerManager;
12+
use App\Services\Websites\Kaina24Website;
13+
use App\Services\Websites\KilobaitasWebsite;
14+
use App\Services\Websites\SkytechWebsite;
15+
use App\Services\Websites\TopoCentrasWebsite;
16+
use App\Services\Websites\VarleWebsite;
1117
use Symfony\Component\DependencyInjection\ContainerBuilder;
1218
use Symfony\Component\DependencyInjection\TaggedContainerInterface;
1319
use Twig\Environment;
1420
use Twig\Loader\FilesystemLoader;
15-
use FilesystemIterator;
1621

1722
class ContainerBindings
1823
{
@@ -33,59 +38,42 @@ public function bind(): void
3338
$this->container->register(Logger::class, Logger::class);
3439
$this->container->register(Mailer::class, Mailer::class);
3540
$this->container->register(ProductCache::class, ProductCache::class);
41+
$this->container
42+
->register(CrawlerObserver::class, CrawlerObserver::class)
43+
->addArgument($this->container->get(ProductCache::class));
44+
3645

3746
$loader = new FilesystemLoader(getcwd() . '/views/email');
3847
$twig = new Environment($loader, ['cache' => getcwd() . '/storage/cache']);
3948
$this->container->set(get_class($twig), $twig);
4049

41-
$this->registerManagerWebsites();
42-
43-
$this->container
44-
->register(ScanWebCommand::class, ScanWebCommand::class)
45-
->addArgument($this->container->get(Manager::getContainerId()))
46-
->addArgument($this->container->get(Logger::class))
47-
->addArgument($this->container->get(Mailer::class))
48-
->addArgument($this->container->get(get_class($twig)))
49-
->addArgument($_ENV['NOTIFICATION_EMAIL'])
50-
;
51-
}
52-
53-
private function registerManagerWebsites(): void
54-
{
5550
$manager = $this->container
56-
->register(Manager::getContainerId(), Manager::class)
51+
->register(CrawlerManager::class, CrawlerManager::class)
5752
->addArgument($this->container->get(ProductCache::class))
58-
->addArgument($_ENV['MANAGER_RESCAN_TIME_SECONDS'] ?? 60)
59-
;
60-
61-
$params = array_filter(explode(',', $_ENV['SCAN_KEYWORDS'] ?? ','));
62-
$dir = getcwd() . '/app/Services/Websites';
53+
->addArgument($this->container->get(CrawlerObserver::class))
54+
->addArgument($this->container->get(Logger::class));
6355

64-
$iterator = new FilesystemIterator($dir);
65-
foreach ($iterator as $file) {
66-
$filename = $file->getFileName();
67-
if ($file->getExtension() !== 'php') {
68-
continue;
69-
}
70-
if (strpos($filename, 'Abstract') !== false) {
71-
continue;
72-
}
73-
if (strpos($filename, 'Interface') !== false) {
74-
continue;
75-
}
76-
if (strpos($filename, 'Website') === false) {
77-
continue;
78-
}
56+
$keywords = array_filter(explode(',', $_ENV['SCAN_KEYWORDS'] ?? ','));
7957

80-
$fileParts = explode('.', $filename);
81-
$objPath = 'App\Services\Websites\\' . $fileParts[0];
58+
$websites[] = $this->container->register(VarleWebsite::class, VarleWebsite::class);
59+
$websites[] = $this->container->register(KilobaitasWebsite::class, KilobaitasWebsite::class);
60+
$websites[] = $this->container->register(SkytechWebsite::class, SkytechWebsite::class);
61+
$websites[] = $this->container->register(TopoCentrasWebsite::class, TopoCentrasWebsite::class);
62+
$websites[] = $this->container->register(Kaina24Website::class, Kaina24Website::class);
8263

83-
$website = $this->container->register($objPath, $objPath);
84-
foreach ($params as $param) {
85-
$website->addMethodCall('addKeyword', [$param]);
86-
}
8764

88-
$manager->addMethodCall('addWebsite', [$this->container->get($objPath)]);
65+
foreach ($websites as $website) {
66+
$website->addMethodCall('setKeywords', [$keywords]);
8967
}
68+
69+
$manager->addMethodCall('setWebsites', [$websites]);
70+
71+
$this->container
72+
->register(CrawlCommand::class, CrawlCommand::class)
73+
->addArgument($this->container->get(CrawlerManager::class))
74+
->addArgument($this->container->get(Logger::class))
75+
->addArgument($this->container->get(Mailer::class))
76+
->addArgument($this->container->get(get_class($twig)))
77+
->addArgument($_ENV['NOTIFICATION_EMAIL']);
9078
}
9179
}

app/Services/Cache/AbstractCache.php

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
<?php
2+
declare(strict_types=1);
3+
4+
namespace App\Services\Cache;
5+
6+
use Symfony\Component\Cache\Adapter\FilesystemAdapter;
7+
8+
/**
 * Base class for caches backed by Symfony's FilesystemAdapter.
 *
 * Concrete caches only have to provide a namespace; the adapter itself is
 * created here and exposed to subclasses through the protected $cache property.
 */
abstract class AbstractCache
{
    /** @var FilesystemAdapter Underlying cache pool shared with subclasses. */
    protected $cache;

    /**
     * Creates the filesystem cache pool for this cache's namespace.
     *
     * @throws \RuntimeException When the cache directory cannot be determined.
     */
    public function __construct()
    {
        // Second argument is the adapter's default item lifetime, in seconds.
        $this->cache = new FilesystemAdapter($this->getCacheNameSpace(), 1800, $this->getCacheDir());
    }

    /**
     * Namespace passed to the adapter for this cache.
     */
    abstract public function getCacheNameSpace(): string;

    /**
     * Directory in which cache files are stored: "<current working directory>/storage/cache".
     *
     * @throws \RuntimeException When the current working directory cannot be resolved.
     */
    protected function getCacheDir(): string
    {
        $workingDirectory = getcwd();

        // getcwd() returns false on failure; concatenating it would silently
        // yield "/storage/cache" at the filesystem root.
        if ($workingDirectory === false) {
            throw new \RuntimeException('Unable to resolve the current working directory for the cache.');
        }

        return $workingDirectory . '/storage/cache';
    }

    /**
     * Clears the underlying cache pool.
     *
     * @return AbstractCache The same instance, to allow call chaining.
     */
    public function clear(): AbstractCache
    {
        $this->cache->clear();

        return $this;
    }
}

app/Services/Cache/ProductCache.php

Lines changed: 6 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,39 +4,28 @@
44
namespace App\Services\Cache;
55

66
use App\Services\Websites\Data\Product;
7-
use Symfony\Component\Cache\Adapter\FilesystemAdapter;
87
use Psr\Cache\InvalidArgumentException;
98

10-
class ProductCache
9+
class ProductCache extends AbstractCache
1110
{
12-
private $cache;
13-
14-
public function __construct()
11+
public function getCacheNameSpace(): string
1512
{
16-
$this->cache = new FilesystemAdapter('products', 1800, $this->getCacheDir());
13+
return 'products';
1714
}
1815

1916
public function getCacheKeyForProduct(Product $product): string
2017
{
2118
return md5($product->getUrl());
2219
}
2320

24-
public function getCacheDir(): string
25-
{
26-
return getcwd() . '/storage/cache';
27-
}
28-
2921
/**
3022
* @throws InvalidArgumentException
3123
*/
3224
public function cacheProduct(Product $product): void
3325
{
34-
$this->cache->get(
35-
$this->getCacheKeyForProduct($product),
36-
function () use ($product) {
37-
return $product;
38-
}
39-
);
26+
$item = $this->cache->getItem($this->getCacheKeyForProduct($product));
27+
$item->set($product);
28+
$this->cache->save($item);
4029
}
4130

4231
/**
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
<?php
2+
declare(strict_types=1);
3+
4+
namespace App\Services\Crawler;
5+
6+
use App\Services\Cache\ProductCache;
7+
use GuzzleHttp\Exception\RequestException;
8+
use Psr\Http\Message\ResponseInterface;
9+
use Psr\Http\Message\UriInterface;
10+
use Spatie\Crawler\CrawlObserver;
11+
12+
/**
 * Common base for crawl observers that work with the product cache.
 *
 * Holds the injected ProductCache and an optional callback that concrete
 * observers can use; how and when the callback is invoked is up to the subclass.
 */
abstract class AbstractCrawler extends CrawlObserver
{
    /** @var ProductCache Product cache injected through the constructor. */
    protected $cache;

    /** @var callable|null Callback registered via setCrawledCallback(); unset until then. */
    protected $crawledCallback;

    /**
     * @param ProductCache $cache Product cache made available to subclasses.
     */
    public function __construct(ProductCache $cache)
    {
        $this->cache = $cache;
    }

    /**
     * Observer hook for a crawled URL and its response; implemented by subclasses.
     *
     * @param UriInterface      $url        URL that was crawled.
     * @param ResponseInterface $response   Response received for that URL.
     * @param UriInterface|null $foundOnUrl URL on which $url was found, if any.
     */
    abstract public function crawled(UriInterface $url, ResponseInterface $response, ?UriInterface $foundOnUrl = null);

    /**
     * Observer hook for a failed request: the failure is not handled here,
     * the original exception is rethrown to the caller.
     *
     * @param UriInterface      $url              URL whose request failed.
     * @param RequestException  $requestException Exception describing the failure.
     * @param UriInterface|null $foundOnUrl       URL on which $url was found, if any.
     *
     * @throws RequestException Always.
     */
    public function crawlFailed(UriInterface $url, RequestException $requestException, ?UriInterface $foundOnUrl = null)
    {
        throw $requestException;
    }

    /**
     * Registers the callback stored in $crawledCallback.
     *
     * @param callable $callback Callback to store.
     *
     * @return AbstractCrawler The same instance, to allow call chaining.
     */
    public function setCrawledCallback(callable $callback): AbstractCrawler
    {
        $this->crawledCallback = $callback;

        return $this;
    }
}

0 commit comments

Comments
 (0)