Skip to content
This repository was archived by the owner on Nov 25, 2023. It is now read-only.

Commit

Permalink
refactor to mime-based content index #1
Browse files Browse the repository at this point in the history
  • Loading branch information
ghost committed May 10, 2023
1 parent 272a885 commit db0e66c
Show file tree
Hide file tree
Showing 13 changed files with 304 additions and 1,267 deletions.
17 changes: 3 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ Could be enabled or disabled by `API_SEARCH_ENABLED` option
```
GET action=search - required
GET query={string} - optional, search request, empty if not provided
GET type={string} - optional, search type, image|default or empty
GET type={string} - optional, filter by one of the available MIME types, or empty
GET page={int} - optional, search results page, 1 if not provided
GET mode=SphinxQL - optional, enable extended SphinxQL syntax
```
Expand Down Expand Up @@ -141,7 +141,7 @@ GET m=SphinxQL
##### Basic features

* [x] Web pages full text ranking search
* [x] Images search with safe proxy preview support
* [x] MIME-filtered search with safe proxy image preview
* [x] Extended syntax support
* [x] Flexible settings compatible with IPv4/IPv6 networks

Expand All @@ -159,18 +159,14 @@ GET m=SphinxQL
* [ ] Index API
+ [x] Manifest
+ [x] Search
+ [x] Pages
+ [x] Images
+ [x] Hosts
+ [ ] Pages
+ [ ] Images
+ [ ] MIME list
* [ ] Context advertising API

##### Crawler

* [x] Auto crawl links by regular expression rules
+ [x] Pages
+ [x] Images
+ [x] Manifests
* [x] Robots.txt / robots meta tags support (#2)
* [x] Specific rules configuration for every host
Expand All @@ -181,8 +177,6 @@ GET m=SphinxQL
* [x] Ban non-condition links to prevent extra requests
* [x] Debug log
* [x] History snaps
+ [x] Pages
+ [x] Images
* [ ] Indexing new sites homepage in higher priority
* [ ] Redirect codes extended processing
* [ ] Palette image index / filter
Expand All @@ -191,17 +185,12 @@ GET m=SphinxQL
##### Cleaner
* [x] Deprecated DB items auto deletion / host settings update
+ [x] Pages
+ [x] Images
+ [x] Manifests
+ [x] Logs
+ [x] Crawler
+ [x] Cleaner
* [x] Deprecated history snaps removing
+ [x] Pages
+ [x] Images
* [x] Banned resources reset by timeout
+ [x] Pages
+ [x] Images
* [x] Debug log

##### Other
Expand Down
88 changes: 7 additions & 81 deletions config/app.php.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,26 +47,14 @@ error_reporting(E_ALL);
* Project domain, without a trailing slash
*
*/
define('WEBSITE_DOMAIN', (!empty($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == 'on' ? 'https' : 'http') . '://' . (!empty($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : ''));
define('WEBSITE_DOMAIN', '');

/*
* Number of page search results shown before the read more link
*
*/
define('WEBSITE_PAGINATION_SEARCH_PAGE_RESULTS_LIMIT', 100);

/*
* Image search results before show the read more link
*
*/
define('WEBSITE_PAGINATION_SEARCH_IMAGE_RESULTS_LIMIT', 10);

/*
* Quantity of related pages for each image in the search results
*
*/
define('WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT', 5);

/*
* Save ident icons to the static webp cache (placed in storage/cache) to prevent CPU overload
*
Expand All @@ -76,7 +64,7 @@ define('WEBSITE_SEARCH_IMAGE_RELATED_PAGE_RESULTS_LIMIT', 5);
define('WEBSITE_IDENTICON_IMAGE_CACHE', true);

// Database
define('DB_HOST', 'localhost');
define('DB_HOST', '127.0.0.1');
define('DB_PORT', 3306);
define('DB_NAME', '');
define('DB_USERNAME', '');
Expand Down Expand Up @@ -144,20 +132,6 @@ define('CRAWL_STOP_DISK_QUOTA_MB_LEFT', 500);
*/
define('CRAWL_PAGE_LIMIT', 20);

/*
* Images (URI) processing limit in the crawler.php queue
*
* This option related to CRAWL_IMAGE_SECONDS_OFFSET value
* and the crontab task frequency (https://github.com/YGGverse/YGGo#crontab)
*
* Usually up to 20 pages per minute,
* to prevent websites overload by sending GET crawling requests
*
* Set 0 to disable
*
*/
define('CRAWL_IMAGE_LIMIT', 10);

/*
* Manifest (URI) processing limit in the crawler.php queue
*
Expand Down Expand Up @@ -194,28 +168,7 @@ define('CRAWL_PAGE_SECONDS_OFFSET', 60*60*24*30*12);
* comma separated
*
*/
define('CRAWL_PAGE_MIME', 'text/html');

/*
* Index images match MIME types
*
* comma separated
*
*/
define('CRAWL_IMAGE_MIME', 'image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml');

/*
* Renew image index by timing offset provided
*
* This option works with CRAWL_IMAGE_LIMIT step queue
*
* Pay attention, that CRAWL_IMAGE_LIMIT + CRAWL_IMAGE_SECONDS_OFFSET pair
* must have enough value to crawl all images collected in the DB index
*
* or the crawler can stuck in queue
*
*/
define('CRAWL_IMAGE_SECONDS_OFFSET', 60*60*24*30*12);
define('CRAWL_PAGE_MIME', 'text/html,application/xhtml+xml,text/plain,image/webp,image/png,image/gif,image/jpeg,image/ico,image/svg+xml');

/*
* Renew manifests index by timing offset provided
Expand All @@ -234,7 +187,7 @@ define('CRAWL_MANIFEST_SECONDS_OFFSET', 60*60*24*30);
* Only URL addresses that match this rule will be auto-crawled
*
*/
define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/ui'
define('CRAWL_URL_REGEXP', '/^http:\/\/\[[\w:]+\].*$/ui');

/*
* Pages limit per new host by default
Expand All @@ -244,7 +197,7 @@ define('CRAWL_URL_REGEXP', '/^.*$/ui'); // ipv6 only '/^http:\/\/\[[\w:]+\].*$/u
* Custom rule for specified host could be provided in the DB `host`.`crawlPageLimit` field
*
*/
define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 1000);
define('CRAWL_HOST_DEFAULT_PAGES_LIMIT', 100000);

/*
* Set default auto-crawl status for new host added
Expand All @@ -264,7 +217,6 @@ define('CRAWL_HOST_DEFAULT_STATUS', true);
* Custom rule for specified host could be provided in the DB `host`.`crawlMetaOnly` field
*
* This option can affect search results relevance
* This option enables image data caching in base64
*
*/
define('CRAWL_HOST_DEFAULT_META_ONLY', false);
Expand All @@ -279,16 +231,6 @@ define('CRAWL_HOST_DEFAULT_META_ONLY', false);
*/
define('CRAWL_HOST_DEFAULT_NSFW', false);

/*
* Not suitable/safe for work status for new host by default
*
* Could be filtered in crawl conditions or search results
*
* Custom rule for specified host could be provided in the DB `host`.`nsfw` field
*
*/
define('CRAWL_HOST_DEFAULT_NSFW', false);

/*
* Default robots.txt rules applied when the remote file does not exist
* The crawler is able to overwrite these rules
Expand Down Expand Up @@ -324,7 +266,7 @@ define('CRAWL_MANIFEST', true);
* Manifest API version compatibility
*
*/
define('CRAWL_MANIFEST_API_VERSION', 0.7);
define('CRAWL_MANIFEST_API_VERSION', 0.8);

/*
* Set default auto-crawl status for new manifest added
Expand Down Expand Up @@ -389,20 +331,6 @@ define('CLEAN_PAGE_BAN_SECONDS_OFFSET', 60*60*24*30);
*/
define('CLEAN_PAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10);

/*
* Remove image ban after following time
*
* This option used in crawler and search page
* to prevent extra http requests to unavailable or not condition resources
*
*/
define('CLEAN_IMAGE_BAN_SECONDS_OFFSET', 60*60*24*30);

/*
* Remove image description history after following time
*
*/
define('CLEAN_IMAGE_DESCRIPTION_OFFSET', 60*60*24*30*12*10);

// API settings

Expand Down Expand Up @@ -445,14 +373,12 @@ define('API_HOSTS_FIELDS',
`host`.`name`,
`host`.`port`,
`host`.`crawlPageLimit`,
`host`.`crawlImageLimit`,
`host`.`robots`,
`host`.`robotsPostfix`,
`host`.`nsfw`,
`host`.`timeAdded`,
`host`.`timeUpdated`,
(SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`,
(SELECT COUNT(*) FROM `hostImage` WHERE `hostImage`.`hostId` = `host`.`hostId`) AS `hostImagesTotal`'); // string: *|field names comma separated
(SELECT COUNT(*) FROM `hostPage` WHERE `hostPage`.`hostId` = `host`.`hostId`) AS `hostPagesTotal`');

/*
* Manifest API
Expand Down
53 changes: 17 additions & 36 deletions config/sphinx.conf.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,48 +12,29 @@ source common
source hostPage : common
{
sql_query = \
SELECT hostPage.hostPageId, \
hostPage.rank, \
hostPage.uri, \
host.name, \
(SELECT CONCAT_WS(' ', hostPageDescription.metaTitle, \
hostPageDescription.metaDescription, \
hostPageDescription.metaKeywords) \
FROM hostPageDescription \
WHERE hostPageDescription.hostPageId = hostPage.hostPageId \
ORDER BY hostPageDescription.timeUpdated DESC, hostPageDescription.timeAdded DESC \
LIMIT 1) AS pageDescription \
FROM hostPage \
JOIN host ON (host.hostId = hostPage.hostId) \
WHERE host.status = '1' AND hostPage.httpCode = 200 AND hostPage.timeBanned IS NULL
SELECT `hostPage`.`hostPageId`, \
`hostPage`.`uri`, \
`host`.`name`, \
REGEXP_REPLACE(`hostPage`.`mime`, '^[A-z-]+/([A-z-]+).*', '$1') AS `mime`, \
(SELECT COUNT(*) FROM `hostPageToHostPage` \
WHERE `hostPageToHostPage`.`hostPageIdTarget` = `hostPage`.`hostPageId` \
AND `hostPageToHostPage`.`hostPageIdSource` <> `hostPage`.`hostPageId`) AS `rank`, \
(SELECT GROUP_CONCAT(CONCAT_WS(' ', `hostPageDescription`.`title`, \
`hostPageDescription`.`description`, \
`hostPageDescription`.`keywords`)) \
FROM `hostPageDescription` \
WHERE `hostPageDescription`.`hostPageId` = `hostPage`.`hostPageId`) AS `pageDescription` \
FROM `hostPage` \
JOIN `host` ON (`host`.`hostId` = `hostPage`.`hostId`) \
WHERE `host`.`status` = '1' AND `hostPage`.`httpCode` = 200 AND `hostPage`.`timeBanned` IS NULL

sql_attr_uint = rank
}

source hostImage : common
{
sql_query = \
SELECT hostImage.hostImageId, hostImage.rank, hostImage.uri, host.name, \
(SELECT GROUP_CONCAT(CONCAT_WS(' ', hostImageDescription.alt, hostImageDescription.title)) \
FROM hostImageDescription \
WHERE hostImageDescription.hostImageId = hostImage.hostImageId) AS imageDescription \
FROM hostImage \
JOIN host ON (host.hostId = hostImage.hostId) \
WHERE host.status = '1' AND hostImage.httpCode = 200 AND hostImage.timeBanned IS NULL \

sql_attr_uint = rank
sql_attr_uint = rank
sql_attr_string = mime
}

index hostPage
{
source = hostPage
morphology = stem_enru, stem_cz, stem_ar
path = /var/lib/sphinxsearch/data/hostPage
}

index hostImage
{
source = hostImage
morphology = stem_enru, stem_cz, stem_ar
path = /var/lib/sphinxsearch/data/hostImage
}
Loading

0 comments on commit db0e66c

Please sign in to comment.