From 52346ab70317c72ec4f8ccd62ff86dc20155d3da Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Tue, 9 Jan 2024 11:49:51 +0100 Subject: [PATCH] Adds detection for various bots and improves detection for Googlebot (#7551) * Improves detection for Googlebot * Update Googlebot urls * Adds detection for Spawning AI * Improves generic bot regex * Adds detection for Domain Research Project * Adds detection for VK Robot * Improves generic bot regex --- Tests/DeviceDetectorTest.php | 4 +- Tests/Parser/BotTest.php | 4 +- Tests/fixtures/bots.yml | 221 +++++++++++++++++++++-------------- regexes/bots.yml | 44 +++++-- 4 files changed, 171 insertions(+), 102 deletions(-) diff --git a/Tests/DeviceDetectorTest.php b/Tests/DeviceDetectorTest.php index faeefe9992..bb785c5c88 100644 --- a/Tests/DeviceDetectorTest.php +++ b/Tests/DeviceDetectorTest.php @@ -449,10 +449,10 @@ public function testGetInfoFromUABot(): void 'bot' => [ 'name' => 'Googlebot', 'category' => 'Search bot', - 'url' => 'http://www.google.com/bot.html', + 'url' => 'https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers', 'producer' => [ 'name' => 'Google Inc.', - 'url' => 'http://www.google.com', + 'url' => 'https://www.google.com/', ], ], ]; diff --git a/Tests/Parser/BotTest.php b/Tests/Parser/BotTest.php index 9252ddc2d0..a4133cff78 100644 --- a/Tests/Parser/BotTest.php +++ b/Tests/Parser/BotTest.php @@ -22,10 +22,10 @@ public function testGetInfoFromUABot(): void $expected = [ 'name' => 'Googlebot', 'category' => 'Search bot', - 'url' => 'http://www.google.com/bot.html', + 'url' => 'https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers', 'producer' => [ 'name' => 'Google Inc.', - 'url' => 'http://www.google.com', + 'url' => 'https://www.google.com/', ], ]; $botParser = new Bot(); diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 7fdd39feef..08e2805b99 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -1201,307 +1201,307 @@ bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: AdsBot-Google-Mobile (+http://www.google.com/mobile/adsbot.html) Mozilla (iPhone; U; CPU iPhone OS 3 0 like Mac OS X) AppleWebKit (KHTML, like Gecko) Mobile Safari bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: APIs-Google (+https://developers.google.com/webmasters/APIs-Google.html) bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: DoCoMo/2.0 N905i(c100;TB;W24H16) (compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html) bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Feedfetcher-Google; (+http://www.google.com/feedfetcher.html; 19 subscribers; feed-id=13965549748850348809) bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Feedfetcher-Google; (+http://www.google.com/feedfetcher.html; 2 subscribers; feed-id=17860707833818568603) bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Feedfetcher-Google; (+http://www.google.com/feedfetcher.html; 375 subscribers; feed-id=15381863289700640853) bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Google-AdWords-Express bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Google-Adwords-Instant (+http://www.google.com/adsbot.html) bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Google-speakr bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Googlebot (gocrawl v0.4) bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Googlebot-Image/1.0 bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Googlebot-News (2.3.3, ruby 1.9.3 (2013-11-22)) bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Googlebot-Video/1.0 bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Googlebot/2.1 (http://www.googlebot.com/bot.html) bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Googlebot/Nutch-1.7 bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: GoogleProducer; (+http://goo.gl/7y4SX) bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Mediapartners-Google bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Mozilla/5.0 (compatible) Feedfetcher-Google;(+http://www.google.com/feedfetcher.html) bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Mozilla/5.0 (compatible; Google-Youtube-Links) bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html) bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Mozilla/5.0 (en-US) AppleWebKit/537.36 (KHTML, like Gecko; Google-Assess) Chrome/34.0.1847.116 Safari/537.36 bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Mozilla/5.0 (en-us) AppleWebKit/537.36(KHTML, like Gecko; Google-Adwords-DisplayAds-WebRender;) Chrome/41.0.2272.118Safari/537.36 bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25 (compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html) bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/537.36 (KHTML, like Gecko; Google-Publisher-Plugin) Chrome/27.0.1453 Mobile Safari/537.36 bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Google-AMPHTML) bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Google-Read-Aloud; +https://support.google.com/webmasters/answer/1061943) bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012; DuplexWeb-Google/1.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Mobile Safari/537.36 bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20110814 Firefox/6.0 Google (+https://developers.google.com/+/web/snippet/) bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453 Safari/537.36 (compatible; Google-HotelAdsVerifier/2.0) bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko; Google Web Preview) Chrome/27.0.1453 Safari/537.36 bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36 (compatible; Google-Shopping-Quality +http://www.google.com/merchants/tos/extend/DE/tos.html) bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Safari/537.36 bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: SAMSUNG-SGH-E250/1.0 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Browser/6.2.3.3.c.1.101 (GUI) MMP/2.0 (compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html) bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Grammarly/1.0 (http://www.grammarly.com) bot: @@ -3123,10 +3123,10 @@ bot: name: VK Share Button category: Crawler - url: http://vk.com/dev/Share + url: https://dev.vk.com/en/widgets/share producer: name: VK - url: http://vk.com/ + url: https://vk.com/ - user_agent: 'Mozilla/4.0 (compatible; Vagabondo/4.0; http://webagent.wise-guys.nl/; http://www.wise-guys.nl/)' bot: @@ -3932,10 +3932,10 @@ bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; BDCbot/1.0; +http://bigweb.bigdatacorp.com.br/faq.aspx) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 bot: @@ -4605,10 +4605,10 @@ bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: cortex/1.0 bot: @@ -4618,10 +4618,10 @@ bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Mozilla/5.0 (compatible; TigerBot/8.69; Tiger.ch) bot: @@ -4663,10 +4663,10 @@ bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: CrowdTanglebot/1.0 bot: @@ -5446,10 +5446,10 @@ bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Mozilla/5.0 (iplabel; Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36 bot: @@ -5555,10 +5555,10 @@ bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: 'hackermention ' bot: @@ -5774,10 +5774,10 @@ bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot) bot: @@ -5792,19 +5792,19 @@ bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: GoogleOther bot: name: Googlebot category: Search bot - url: http://www.google.com/bot.html + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers producer: name: Google Inc. - url: http://www.google.com + url: https://www.google.com/ - user_agent: Ant.com beta/1.0 (Crawling antmarks; https://www.ant.com; admin@ant.com) bot: @@ -6226,3 +6226,48 @@ producer: name: Yandex LLC url: https://yandex.com/company/ +- + user_agent: Google-Safety + bot: + name: Googlebot + category: Search bot + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers + producer: + name: Google Inc. + url: https://www.google.com/ +- + user_agent: Google-Extended + bot: + name: Googlebot + category: Search bot + url: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers + producer: + name: Google Inc. + url: https://www.google.com/ +- + user_agent: Spawning-AI + bot: + name: Spawning AI + category: Crawler + url: https://spawning.ai/ + producer: + name: Spawning, Inc + url: https://spawning.ai/ +- + user_agent: domain research project (+https://trentwil.es/domains.html) + bot: + name: Domain Research Project + category: Crawler + url: https://trentwil.es/domains.html + producer: + name: Trent Wiles + url: https://trentwil.es/ +- + user_agent: Mozilla/5.0 (compatible; VKRobot/1.0) + bot: + name: VK Robot + category: Crawler + url: https://dev.vk.com/en/ + producer: + name: VK + url: https://vk.com/ diff --git a/regexes/bots.yml b/regexes/bots.yml index 39ef596d32..2941318346 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -744,21 +744,21 @@ name: 'Visual Meta' url: 'https://www.shopalike.cz/' -- regex: 'AdsBot-Google|Adwords-(DisplayAds|Express|Instant)|Google Web Preview|Google[ -]Publisher[ -]Plugin|Google-(Ads-Conversions|Ads-Qualify|Adwords|AMPHTML|Assess|HotelAdsVerifier|InspectionTool|Read-Aloud|Shopping-Quality|Site-Verification|speakr|Stale-Content-Probe|Test|Youtube-Links)|(APIs|DuplexWeb|Feedfetcher|Mediapartners)-Google|Googlebot|Google(?:AdSenseInfeed|AssociationService|Other|Prober|Producer)|Google.*/\+/web/snippet' +- regex: 'Adwords-(?:DisplayAds|Express|Instant)|Google Web Preview|Google[ -]Publisher[ -]Plugin|Google-(?:Ads-Conversions|Ads-Qualify|Adwords|AMPHTML|Assess|Extended|HotelAdsVerifier|InspectionTool|Read-Aloud|Safety|Shopping-Quality|Site-Verification|speakr|Stale-Content-Probe|Test|Youtube-Links)|(?:AdsBot|APIs|DuplexWeb|Feedfetcher|Mediapartners)-Google|Google(?:AdSenseInfeed|AssociationService|bot|Other|Prober|Producer)|Google.*/\+/web/snippet' name: 'Googlebot' category: 'Search bot' - url: 'http://www.google.com/bot.html' + url: 'https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers' producer: name: 'Google Inc.' - url: 'http://www.google.com' + url: 'https://www.google.com/' - regex: '^Google$' name: 'Googlebot' category: 'Search bot' - url: 'http://www.google.com/bot.html' + url: 'https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers' producer: name: 'Google Inc.' - url: 'http://www.google.com' + url: 'https://www.google.com/' - regex: 'heritrix' name: 'Heritrix' @@ -1638,10 +1638,18 @@ - regex: 'vkShare; ' name: 'VK Share Button' category: 'Crawler' - url: 'http://vk.com/dev/Share' + url: 'https://dev.vk.com/en/widgets/share' producer: name: 'VK' - url: 'http://vk.com/' + url: 'https://vk.com/' + +- regex: 'VKRobot' + name: 'VK Robot' + category: 'Crawler' + url: 'https://dev.vk.com/en/' + producer: + name: 'VK' + url: 'https://vk.com/' - regex: 'VSMCrawler' name: 'Visual Site Mapper Crawler' @@ -2043,9 +2051,6 @@ - regex: 'RSSRadio \(Push Notification Scanner;support@dorada\.co\.uk\)' name: 'RSSRadio Bot' -- regex: '(A6-Indexer|nuhk|TsolCrawler|Yammybot|Openbot|Gulper Web Bot|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr.com|tweetedtimes.com|TrendsmapResolver|teoma|blitzbot|oegp|furlbot|http%20client|polybot|htdig|mogimogi|larbin|scrubby|searchsight|seekbot|semanticdiscovery|snappy|vortex(?!(?: Build|Plus))|zeal(?!ot)|fast-webcrawler|converacrawler|dataparksearch|findlinks|BrowserMob|HttpMonitor|ThumbShotsBot|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|RackspaceBot|robots|SeopultContentAnalyzer|7Siters|centuryb.o.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|My User Agent|cortex|CF-UC User Agent|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|custom_user_agent|Test Certificate Info|iplabel|Magellan|CustomUserAgent|TheSafex?Internetx?Search|kirkland-signature)' - name: 'Generic Bot' - - regex: '^sentry' name: 'Sentry Bot' producer: @@ -3631,6 +3636,25 @@ name: 'ООО «Регистратор доменных имен РЕГ.РУ»' url: 'https://statonline.ru/' +- regex: 'Spawning-AI' + name: 'Spawning AI' + category: 'Crawler' + url: 'https://spawning.ai/' + producer: + name: 'Spawning, Inc' + url: 'https://spawning.ai/' + +- regex: 'domain research project' + name: 'Domain Research Project' + category: 'Crawler' + url: 'https://trentwil.es/domains.html' + producer: + name: 'Trent Wiles' + url: 'https://trentwil.es/' + # Generic detections +- regex: '(A6-Indexer|nuhk|TsolCrawler|Yammybot|Openbot|Gulper Web Bot|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr.com|tweetedtimes.com|TrendsmapResolver|teoma|blitzbot|oegp|furlbot|http%20client|polybot|htdig|mogimogi|larbin|scrubby|searchsight|seekbot|semanticdiscovery|snappy|vortex(?!(?: Build|Plus))|zeal(?!ot)|fast-webcrawler|converacrawler|dataparksearch|findlinks|BrowserMob|HttpMonitor|ThumbShotsBot|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|RackspaceBot|robots|SeopultContentAnalyzer|7Siters|centuryb.o.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|My User Agent|cortex|CF-UC User Agent|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|custom_user_agent|Test Certificate Info|iplabel|Magellan|CustomUserAgent|TheSafex?Internetx?Search|kirkland-signature|research|project(?!or))' + name: 'Generic Bot' + - regex: '[a-z0-9\-_]*((?