From a5abb11c9fa7656c15804c4f9266acbbfa0c9c4e Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 03:13:41 +0200 Subject: [PATCH 01/33] Fix user agent --- Tests/fixtures/bots.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 6cb88bafc9..a607465747 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -2475,7 +2475,7 @@ name: Quora url: http://www.quora.com - - user_agent: 'Mozilla/5.0 (compatible; Qwantify/2.2w; +https://www.qwant.com/)/*' + user_agent: Mozilla/5.0 (compatible; Qwantify/2.2w; +https://www.qwant.com/) bot: name: Qwantify category: Crawler From 31b781e28004d30e9bfa2861eb98b7b33e63cf45 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 03:14:10 +0200 Subject: [PATCH 02/33] Add another user agent for Qwantify --- Tests/fixtures/bots.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index a607465747..fa79352d61 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -2483,6 +2483,15 @@ producer: name: Qwant Corporation url: https://www.qwant.com/ +- + user_agent: Mozilla/5.0 (compatible; Qwantify-prod34997/1.0; +https://help.qwant.com/bot/) + bot: + name: Qwantify + category: Crawler + url: https://www.qwant.com/ + producer: + name: Qwant Corporation + url: https://www.qwant.com/ - user_agent: ROI Hunter; https://api-dev.roihunter.com bot: From c18b7db534d13f19dedf8068208dd0d7ea9b7150 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 03:18:56 +0200 Subject: [PATCH 03/33] Add test for PagePeeker --- Tests/fixtures/bots.yml | 9 +++++++++ regexes/bots.yml | 5 +++++ 2 files changed, 14 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index fa79352d61..994fa87121 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7812,3 +7812,12 @@ producer: name: Meins und Vogel GmbH 
url: https://muv.com/ +- + user_agent: Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36 (compatible; PagePeeker/3.0; +https://pagepeeker.com/robots/) + bot: + name: PagePeeker + category: Crawler + url: https://pagepeeker.com/robots/ + producer: + name: PAGEPEEKER SRL + url: https://pagepeeker.com/ diff --git a/regexes/bots.yml b/regexes/bots.yml index a93559a6d5..24890e60ef 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -2187,6 +2187,11 @@ - regex: 'PagePeeker' name: 'PagePeeker' + category: 'Crawler' + url: 'https://pagepeeker.com/robots/' + producer: + name: 'PAGEPEEKER SRL' + url: 'https://pagepeeker.com/' - regex: 'WebThumbnail' name: 'WebThumbnail' From 7c17088fc4df15a210526f6169861b93978d3ac3 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 03:21:33 +0200 Subject: [PATCH 04/33] Add another test for SemrushBot --- Tests/fixtures/bots.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 994fa87121..f843b1c4aa 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7821,3 +7821,12 @@ producer: name: PAGEPEEKER SRL url: https://pagepeeker.com/ +- + user_agent: Mozilla/5.0 (compatible; SemrushBot-SWA/0.1; +http://www.semrush.com/bot.html) + bot: + name: SemrushBot + category: Crawler + url: https://www.semrush.com/bot/ + producer: + name: Semrush Inc. 
+ url: https://www.semrush.com/ From 823b2f95128f82f552d795a867d20da2a3832c88 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 03:24:04 +0200 Subject: [PATCH 05/33] Improves DuckDuckBot --- Tests/fixtures/bots.yml | 8 ++++---- regexes/bots.yml | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index f843b1c4aa..0cb2115c60 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -831,18 +831,18 @@ - user_agent: DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html) bot: - name: DuckDuckGo Bot + name: DuckDuckBot category: Search bot - url: https://duckduckgo.com/duckduckbot + url: https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/ producer: name: DuckDuckGo url: https://duckduckgo.com/ - user_agent: Mozilla/5.0 (compatible; DuckDuckGo-Favicons-Bot/1.0; +http://duckduckgo.com) bot: - name: DuckDuckGo Bot + name: DuckDuckBot category: Search bot - url: https://duckduckgo.com/duckduckbot + url: https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/ producer: name: DuckDuckGo url: https://duckduckgo.com/ diff --git a/regexes/bots.yml b/regexes/bots.yml index 24890e60ef..6c1d13c240 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -513,9 +513,9 @@ url: 'http://moz.com/' - regex: 'DuckDuck(?:Go-Favicons-)?Bot' - name: 'DuckDuckGo Bot' + name: 'DuckDuckBot' category: 'Search bot' - url: 'https://duckduckgo.com/duckduckbot' + url: 'https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot/' producer: name: 'DuckDuckGo' url: 'https://duckduckgo.com/' From 39e978e5609a6562574c7e31ca1889041a4af7e2 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 03:25:22 +0200 Subject: [PATCH 06/33] Adds detection for DuckAssistBot --- Tests/fixtures/bots.yml | 9 +++++++++ regexes/bots.yml | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 
0cb2115c60..1315edc6c8 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -846,6 +846,15 @@ producer: name: DuckDuckGo url: https://duckduckgo.com/ +- + user_agent: DuckAssistBot/1.1; (+http://duckduckgo.com/duckassistbot.html) + bot: + name: DuckAssistBot + category: Search bot + url: https://duckduckgo.com/duckduckgo-help-pages/results/duckassistbot/ + producer: + name: DuckDuckGo + url: https://duckduckgo.com/ - user_agent: EMail Exractor bot: diff --git a/regexes/bots.yml b/regexes/bots.yml index 6c1d13c240..8cc7a34b5d 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -520,6 +520,14 @@ name: 'DuckDuckGo' url: 'https://duckduckgo.com/' +- regex: 'DuckAssistBot' + name: 'DuckAssistBot' + category: 'Search bot' + url: 'https://duckduckgo.com/duckduckgo-help-pages/results/duckassistbot/' + producer: + name: 'DuckDuckGo' + url: 'https://duckduckgo.com/' + - regex: 'EasouSpider' name: 'Easou Spider' category: 'Search bot' From f1fe175a04b741bc47adfe9c7877a9eba5e28a2f Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 03:28:17 +0200 Subject: [PATCH 07/33] Adds detection for RedekenBot --- Tests/fixtures/bots.yml | 9 +++++++++ regexes/bots.yml | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 1315edc6c8..6ed2a4c0f8 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7839,3 +7839,12 @@ producer: name: Semrush Inc. 
url: https://www.semrush.com/ +- + user_agent: Mozilla/5.0 (compatible; RedekenBot/0.1; +https://www.redeken.com/bot/) + bot: + name: RedekenBot + category: Crawler + url: https://www.redeken.com/en/help/bot.html + producer: + name: Redeken + url: https://www.redeken.com/ diff --git a/regexes/bots.yml b/regexes/bots.yml index 8cc7a34b5d..bf9a777901 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4546,6 +4546,14 @@ name: 'Meins und Vogel GmbH' url: 'https://muv.com/' +- regex: 'RedekenBot' + name: 'RedekenBot' + category: 'Crawler' + url: 'https://www.redeken.com/en/help/bot.html' + producer: + name: 'Redeken' + url: 'https://www.redeken.com/' + # Generic bots - regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? 
world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherweb|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|url|Zeus|ZmEu)$' name: 'Generic Bot' From fbe57d6d71f060ad902ddec75c8af1d3393546a7 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 03:48:42 +0200 Subject: [PATCH 08/33] Adds detection for semaltbot --- Tests/fixtures/bots.yml | 9 +++++++++ regexes/bots.yml | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 6ed2a4c0f8..048b3187b7 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7848,3 +7848,12 @@ producer: name: Redeken url: https://www.redeken.com/ +- + user_agent: semaltbot/0.1 (+http://semalt.net) + bot: + name: semaltbot + category: Crawler + url: https://semalt.net/ + producer: + name: Semalt LP + url: https://semalt.net/ diff --git a/regexes/bots.yml b/regexes/bots.yml index bf9a777901..1081289606 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4554,6 +4554,14 @@ name: 'Redeken' url: 'https://www.redeken.com/' +- regex: 'semaltbot' + name: 'semaltbot' + category: 'Crawler' + url: 'https://semalt.net/' + producer: + name: 'Semalt LP' + url: 'https://semalt.net/' + # Generic bots - regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re 
Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherweb|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|url|Zeus|ZmEu)$' name: 'Generic Bot' From 2e285f0123c4ca96d5c365b42087b3f49e96882c Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 03:50:42 +0200 Subject: [PATCH 09/33] Adds detection for MakeMerryBot --- Tests/fixtures/bots.yml | 6 ++++++ regexes/bots.yml | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 048b3187b7..1f5c1221e5 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7857,3 +7857,9 @@ producer: name: Semalt LP url: https://semalt.net/ +- + user_agent: Mozilla/5.0 (compatible; MakeMerryBot/1.0; +https://makemerry.app/bots) + bot: + name: MakeMerryBot + category: Crawler + url: https://makemerry.app/bots diff --git a/regexes/bots.yml b/regexes/bots.yml index 1081289606..6c2627720b 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4562,6 +4562,11 @@ name: 'Semalt LP' url: 'https://semalt.net/' +- regex: 'MakeMerryBot' + name: 'MakeMerryBot' + category: 'Crawler' + url: 'https://makemerry.app/bots' + # Generic bots - regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re 
Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherweb|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|url|Zeus|ZmEu)$' name: 'Generic Bot' From 1d31b7b5916031ad47149d4b334cb928364bec66 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 03:53:23 +0200 Subject: [PATCH 10/33] Adds detection for Timpibot --- Tests/fixtures/bots.yml | 18 ++++++++++++++++++ regexes/bots.yml | 8 ++++++++ 2 files changed, 26 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 1f5c1221e5..4e544d0e71 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7863,3 +7863,21 @@ name: MakeMerryBot category: Crawler url: https://makemerry.app/bots +- + user_agent: Timpibot/0.9 (+http://www.timpi.io) + bot: + name: Timpibot + category: Crawler + url: https://timpi.io/ + producer: + name: Timpi Inc. + url: https://timpi.io/ +- + user_agent: Mozilla/5.0 (compatible; Timpibot/0.8; +http://www.timpi.io) + bot: + name: Timpibot + category: Crawler + url: https://timpi.io/ + producer: + name: Timpi Inc. + url: https://timpi.io/ diff --git a/regexes/bots.yml b/regexes/bots.yml index 6c2627720b..249b0e0243 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4567,6 +4567,14 @@ category: 'Crawler' url: 'https://makemerry.app/bots' +- regex: 'Timpibot' + name: 'Timpibot' + category: 'Crawler' + url: 'https://timpi.io/' + producer: + name: 'Timpi Inc.' 
+ url: 'https://timpi.io/' + # Generic bots - regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherweb|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|url|Zeus|ZmEu)$' name: 'Generic Bot' From 6ee0dcaeb0100801a03fb6bbf9ddf157024245bc Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 03:56:15 +0200 Subject: [PATCH 11/33] Add generic bot test --- Tests/fixtures/bots.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 4e544d0e71..989adf538f 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7881,3 +7881,7 @@ producer: name: Timpi Inc. 
url: https://timpi.io/ +- + user_agent: 'Tublm.com/Bot/fubpdfdotcom/Bot/Bot -❤️- +https://tublm.com/game/2048_merge' + bot: + name: Generic Bot From 2e7dcc82792a47add47b906aaf8d7f6ab722c146 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 03:58:02 +0200 Subject: [PATCH 12/33] Adds detection for ValidBot --- Tests/fixtures/bots.yml | 9 +++++++++ regexes/bots.yml | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 989adf538f..3aeb781903 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7885,3 +7885,12 @@ user_agent: 'Tublm.com/Bot/fubpdfdotcom/Bot/Bot -❤️- +https://tublm.com/game/2048_merge' bot: name: Generic Bot +- + user_agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15 (compatible; Validbot; +https://www.validbot.com) + bot: + name: ValidBot + category: Crawler + url: https://www.validbot.com/ + producer: + name: Jake Olefsky LLC + url: https://www.validbot.com/ diff --git a/regexes/bots.yml b/regexes/bots.yml index 249b0e0243..08336b43e1 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4575,6 +4575,14 @@ name: 'Timpi Inc.' url: 'https://timpi.io/' +- regex: 'Validbot' + name: 'ValidBot' + category: 'Crawler' + url: 'https://www.validbot.com/' + producer: + name: 'Jake Olefsky LLC' + url: 'https://www.validbot.com/' + # Generic bots - regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? 
world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherweb|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|url|Zeus|ZmEu)$' name: 'Generic Bot' From 51416c9530c7d8d0804ebe870f0519515ee513fd Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 04:04:35 +0200 Subject: [PATCH 13/33] Adds detection for NameProtect --- Tests/fixtures/bots.yml | 9 +++++++++ regexes/bots.yml | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 3aeb781903..ae83c51050 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7894,3 +7894,12 @@ producer: name: Jake Olefsky LLC url: https://www.validbot.com/ +- + user_agent: NPBot + bot: + name: NameProtect + category: Crawler + url: https://www.cscglobal.com/cscglobal/home/ + producer: + name: NameProtect, Inc. + url: https://www.cscglobal.com/ diff --git a/regexes/bots.yml b/regexes/bots.yml index 08336b43e1..4f5f7c9d58 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4583,6 +4583,14 @@ name: 'Jake Olefsky LLC' url: 'https://www.validbot.com/' +- regex: 'NPBot' + name: 'NameProtect' + category: 'Crawler' + url: 'https://www.cscglobal.com/cscglobal/home/' + producer: + name: 'NameProtect, Inc.' 
+ url: 'https://www.cscglobal.com/' + # Generic bots - regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherweb|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|url|Zeus|ZmEu)$' name: 'Generic Bot' From a58e3242b5dc1ee2f30283acea9f11d98b381838 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 04:08:31 +0200 Subject: [PATCH 14/33] Change name --- Tests/fixtures/bots.yml | 2 +- regexes/bots.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index ae83c51050..b2ce2e2860 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7897,7 +7897,7 @@ - user_agent: NPBot bot: - name: NameProtect + name: NameProtectBot category: Crawler url: https://www.cscglobal.com/cscglobal/home/ producer: diff --git a/regexes/bots.yml b/regexes/bots.yml index 4f5f7c9d58..82f387ad66 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4584,7 +4584,7 @@ url: 'https://www.validbot.com/' - regex: 'NPBot' - name: 'NameProtect' + name: 'NameProtectBot' category: 'Crawler' url: 'https://www.cscglobal.com/cscglobal/home/' producer: From 
11c297664a85838d47032391b1fc89ea74bea1a5 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 04:31:46 +0200 Subject: [PATCH 15/33] Adds detection for CLASSLA-web --- Tests/fixtures/bots.yml | 9 +++++++++ regexes/bots.yml | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index b2ce2e2860..19fe0d8892 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -5081,6 +5081,15 @@ producer: name: Jožef Stefan Institute url: https://www.ijs.si/ijsw/JSI +- + user_agent: Mozilla/5.0 (compatible; CLASSLA-web; +https://www.clarin.si/info/classla-web-crawler/) + bot: + name: CLASSLA-web + category: Crawler + url: https://www.clarin.si/info/classla-web-crawler/ + producer: + name: Jožef Stefan Institute + url: https://www.ijs.si/ijsw/JSI - user_agent: "Electronic Frontier Foundation's Do Not Track Verifier (for questions or concerns email dnt-policy@eff.org)" bot: diff --git a/regexes/bots.yml b/regexes/bots.yml index 82f387ad66..464b23cd1e 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -3191,6 +3191,14 @@ name: 'Jožef Stefan Institute' url: 'https://www.ijs.si/ijsw/JSI' +- regex: 'CLASSLA' + name: 'CLASSLA-web' + category: 'Crawler' + url: 'https://www.clarin.si/info/classla-web-crawler/' + producer: + name: 'Jožef Stefan Institute' + url: 'https://www.ijs.si/ijsw/JSI' + - regex: 'dnt-policy@eff\.org' name: 'EFF Do Not Track Verifier' category: 'Crawler' From ec1f769037abc02c4f246728b501b28411198d35 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 04:32:28 +0200 Subject: [PATCH 16/33] Add generic bot test --- Tests/fixtures/bots.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 19fe0d8892..2225ba31a6 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7912,3 +7912,7 @@ producer: name: NameProtect, Inc. 
url: https://www.cscglobal.com/ +- + user_agent: Mozilla/5.0 (compatible; CuriousCatgirl Research; +https://curiouscatgirl.cynthia.dev) + bot: + name: Generic Bot From 0e28a358cf58fc8c383bf1120a0b50371878a45d Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 22:02:41 +0200 Subject: [PATCH 17/33] Improves detection for generic bots --- Tests/fixtures/bots.yml | 4 ++++ regexes/bots.yml | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 2225ba31a6..c03a928fbc 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7916,3 +7916,7 @@ user_agent: Mozilla/5.0 (compatible; CuriousCatgirl Research; +https://curiouscatgirl.cynthia.dev) bot: name: Generic Bot +- + user_agent: xx032_bo9vs83_2a + bot: + name: Generic Bot diff --git a/regexes/bots.yml b/regexes/bots.yml index 464b23cd1e..8e7b4dafbd 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4600,7 +4600,7 @@ url: 'https://www.cscglobal.com/' # Generic bots -- regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? 
world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherweb|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|url|Zeus|ZmEu)$' +- regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherweb|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|url|Zeus|ZmEu)$' name: 'Generic Bot' # Generic detections From 0dd9487a38a84a3787266df1426884fcd1c1e72a Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 22:14:45 +0200 Subject: [PATCH 18/33] Move heritrix at the bottom --- regexes/bots.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/regexes/bots.yml b/regexes/bots.yml index 8e7b4dafbd..26b82ff97c 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -837,14 +837,6 @@ name: 'Google Inc.' 
url: 'https://www.google.com/' -- regex: 'heritrix' - name: 'Heritrix' - category: 'Crawler' - url: 'https://webarchive.jira.com/wiki/display/Heritrix/Heritrix' - producer: - name: 'The Internet Archive' - url: 'https://archive.org' - - regex: 'HubSpot ' name: 'HubSpot' category: 'Crawler' @@ -4599,6 +4591,14 @@ name: 'NameProtect, Inc.' url: 'https://www.cscglobal.com/' +- regex: 'heritrix' + name: 'Heritrix' + category: 'Crawler' + url: 'https://webarchive.jira.com/wiki/display/Heritrix/Heritrix' + producer: + name: 'The Internet Archive' + url: 'https://archive.org' + # Generic bots - regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? 
world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherweb|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|url|Zeus|ZmEu)$' name: 'Generic Bot' From d077d83ea153ded5075f116f6e12682cdd03d2aa Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 22:18:24 +0200 Subject: [PATCH 19/33] Fix Arquivo.pt test --- Tests/fixtures/bots.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index c03a928fbc..03bd439b83 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -6732,12 +6732,12 @@ - user_agent: Arquivo-web-crawler (compatible; heritrix/3.4.0-20200304 +https://arquivo.pt/faq-crawling) bot: - name: Heritrix + name: Arquivo.pt category: Crawler - url: https://webarchive.jira.com/wiki/display/Heritrix/Heritrix + url: https://sobre.arquivo.pt/en/help/crawling-and-archiving-web-content/ producer: - name: The Internet Archive - url: https://archive.org + name: FCT|FCCN + url: https://www.fct.pt/ - user_agent: Arquivo-web-crawler (compatible; brozzler/1.5 +https://arquivo.pt/faq-crawling) bot: From 1322c70082f0e0375e884e0d3d2c6ad52521376c Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 22:22:01 +0200 Subject: [PATCH 20/33] Adds detection for Domain Codex --- Tests/fixtures/bots.yml | 9 +++++++++ regexes/bots.yml | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 03bd439b83..0adf2f5eea 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7920,3 +7920,12 @@ user_agent: xx032_bo9vs83_2a bot: name: Generic Bot +- + user_agent: 
Mozilla/5.0 (compatible; heritrix/3.3.0-SNAPSHOT-20160721-2308 +https://www.domaincodex.com) + bot: + name: Domain Codex + category: Crawler + url: https://www.domaincodex.com/ + producer: + name: Erie Data Systems, LLC + url: https://www.eriedatasys.com/ diff --git a/regexes/bots.yml b/regexes/bots.yml index 26b82ff97c..46dc12c793 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4591,6 +4591,14 @@ name: 'NameProtect, Inc.' url: 'https://www.cscglobal.com/' +- regex: 'domaincodex\.com' + name: 'Domain Codex' + category: 'Crawler' + url: 'https://www.domaincodex.com/' + producer: + name: 'Erie Data Systems, LLC' + url: 'https://www.eriedatasys.com/' + - regex: 'heritrix' name: 'Heritrix' category: 'Crawler' From 25dbfd2e531e7b0397c94af1cf540c0ddc1ed08c Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 22:29:31 +0200 Subject: [PATCH 21/33] Adds detection for Swisscows Favicons --- Tests/fixtures/bots.yml | 9 +++++++++ regexes/bots.yml | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 0adf2f5eea..68f7391c75 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7929,3 +7929,12 @@ producer: name: Erie Data Systems, LLC url: https://www.eriedatasys.com/ +- + user_agent: Swisscows Favicons + bot: + name: Swisscows Favicons + category: Crawler + url: https://swisscows.com/ + producer: + name: Swisscows AG + url: https://swisscows.com/ diff --git a/regexes/bots.yml b/regexes/bots.yml index 46dc12c793..edf06aeb8f 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4599,6 +4599,14 @@ name: 'Erie Data Systems, LLC' url: 'https://www.eriedatasys.com/' +- regex: 'Swisscows Favicons' + name: 'Swisscows Favicons' + category: 'Crawler' + url: 'https://swisscows.com/' + producer: + name: 'Swisscows AG' + url: 'https://swisscows.com/' + - regex: 'heritrix' name: 'Heritrix' category: 'Crawler' From 7d4dd68ead338cce6a52ab1ba39181da7a3473fa Mon Sep 17 00:00:00 2001 
From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 23:02:11 +0200 Subject: [PATCH 22/33] Adds detection for leak.info --- Tests/fixtures/bots.yml | 6 ++++++ regexes/bots.yml | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 68f7391c75..730d61080d 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7938,3 +7938,9 @@ producer: name: Swisscows AG url: https://swisscows.com/ +- + user_agent: Mozilla/4.0 (compatible; fluid/0.0; +http://www.leak.info/bot.html) + bot: + name: leak.info + category: Crawler + url: http://www.leak.info/ diff --git a/regexes/bots.yml b/regexes/bots.yml index edf06aeb8f..7e4fa42fa3 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4607,6 +4607,11 @@ name: 'Swisscows AG' url: 'https://swisscows.com/' +- regex: 'leak\.info' + name: 'leak.info' + category: 'Crawler' + url: 'http://www.leak.info/' + - regex: 'heritrix' name: 'Heritrix' category: 'Crawler' From 43da177d4b06dea67449aca3bb562e36ba37fe95 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 23:04:35 +0200 Subject: [PATCH 23/33] Adds detection for Workona --- Tests/fixtures/bots.yml | 9 +++++++++ regexes/bots.yml | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 730d61080d..a2bbfbbd52 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7944,3 +7944,12 @@ name: leak.info category: Crawler url: http://www.leak.info/ +- + user_agent: workona-favicon-service/1.0.0 + bot: + name: Workona + category: Crawler + url: https://workona.com/ + producer: + name: Workona, Inc. 
+ url: https://workona.com/ diff --git a/regexes/bots.yml b/regexes/bots.yml index 7e4fa42fa3..a7ed0aa5a8 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4612,6 +4612,14 @@ category: 'Crawler' url: 'http://www.leak.info/' +- regex: 'workona' + name: 'Workona' + category: 'Crawler' + url: 'https://workona.com/' + producer: + name: 'Workona, Inc.' + url: 'https://workona.com/' + - regex: 'heritrix' name: 'Heritrix' category: 'Crawler' From 6429687a32e555f49c76f278d7482b4f05ab5bc3 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 23:12:02 +0200 Subject: [PATCH 24/33] Adds detection for Bloglines --- Tests/fixtures/bots.yml | 9 +++++++++ regexes/bots.yml | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index a2bbfbbd52..4ac9ea361d 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7953,3 +7953,12 @@ producer: name: Workona, Inc. url: https://workona.com/ +- + user_agent: Bloglines/3.1 (http://www.bloglines.com) + bot: + name: Bloglines + category: Crawler + url: https://web.archive.org/web/20140309033202/http://www.bloglines.com/ + producer: + name: Reply!, Inc. + url: https://www.reply.com/ diff --git a/regexes/bots.yml b/regexes/bots.yml index a7ed0aa5a8..62e2a25a43 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4620,6 +4620,14 @@ name: 'Workona, Inc.' url: 'https://workona.com/' +- regex: 'Bloglines' + name: 'Bloglines' + category: 'Crawler' + url: 'https://web.archive.org/web/20140309033202/http://www.bloglines.com/' + producer: + name: 'Reply!, Inc.' 
+ url: 'https://www.reply.com/' + - regex: 'heritrix' name: 'Heritrix' category: 'Crawler' From 88ba0162a2fed14edb8dfee1bc0a0748601c4733 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Sun, 14 Jul 2024 23:16:56 +0200 Subject: [PATCH 25/33] Improves detection for generic bots --- Tests/fixtures/bots.yml | 4 ++++ regexes/bots.yml | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 4ac9ea361d..2824530411 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7962,3 +7962,7 @@ producer: name: Reply!, Inc. url: https://www.reply.com/ +- + user_agent: 'shadowforce.io - sslshed/0.1' + bot: + name: Generic Bot diff --git a/regexes/bots.yml b/regexes/bots.yml index 62e2a25a43..1fc73e1f84 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4637,7 +4637,7 @@ url: 'https://archive.org' # Generic bots -- regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? 
world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherweb|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|url|Zeus|ZmEu)$' +- regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherweb|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|sslshed|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|url|Zeus|ZmEu)$' name: 'Generic Bot' # Generic detections From 344c04216c616e2c0be961d8674c5d77ea503f8a Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Wed, 17 Jul 2024 09:12:42 +0200 Subject: [PATCH 26/33] Adds detection for Marginalia --- Tests/fixtures/bots.yml | 9 +++++++++ regexes/bots.yml | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 2824530411..7a8f28f6df 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7966,3 +7966,12 @@ user_agent: 'shadowforce.io - 
sslshed/0.1' bot: name: Generic Bot +- + user_agent: search.marginalia.nu + bot: + name: Marginalia + category: Crawler + url: https://www.marginalia.nu/marginalia-search/for-webmasters/ + producer: + name: Marginalia + url: https://www.marginalia.nu/ diff --git a/regexes/bots.yml b/regexes/bots.yml index 1fc73e1f84..eaf51296c6 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4636,6 +4636,14 @@ name: 'The Internet Archive' url: 'https://archive.org' +- regex: 'search\.marginalia\.nu' + name: 'Marginalia' + category: 'Crawler' + url: 'https://www.marginalia.nu/marginalia-search/for-webmasters/' + producer: + name: 'Marginalia' + url: 'https://www.marginalia.nu/' + # Generic bots - regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? 
world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherweb|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|sslshed|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|url|Zeus|ZmEu)$' name: 'Generic Bot' From bf4fb69b20a4e0964451c65d69ae0d6f69c5cf98 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Thu, 18 Jul 2024 07:00:13 +0200 Subject: [PATCH 27/33] Adds detection for VU Server Health Scanner --- Tests/fixtures/bots.yml | 9 +++++++++ regexes/bots.yml | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 7a8f28f6df..c20c5424f1 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7975,3 +7975,12 @@ producer: name: Marginalia url: https://www.marginalia.nu/ +- + user_agent: Mozilla/5.0 (compatible;vu-server-health-scanner/1.0;https://130.37.198.75/index.html) + bot: + name: VU Server Health Scanner + category: Security Checker + url: https://130.37.198.75/index.html + producer: + name: VU Amsterdam + url: https://vu.nl/en diff --git a/regexes/bots.yml b/regexes/bots.yml index eaf51296c6..6e62c330cc 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4644,6 +4644,14 @@ name: 'Marginalia' url: 'https://www.marginalia.nu/' +- regex: 'vu-server-health-scanner/[\d.]+' + name: 'VU Server Health Scanner' + category: 'Security Checker' + url: 'https://130.37.198.75/index.html' + producer: + name: 'VU Amsterdam' + url: 'https://vu.nl/en' + # Generic bots - regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| 
HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherweb|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|sslshed|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|url|Zeus|ZmEu)$' name: 'Generic Bot' From f9a3abef33c9a7e33c0a5cabf1b0ff2d90546b00 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Thu, 18 Jul 2024 17:59:00 +0200 Subject: [PATCH 28/33] Improves detection for generic bots --- Tests/fixtures/bots.yml | 4 ++++ regexes/bots.yml | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index c20c5424f1..65986d870f 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7984,3 +7984,7 @@ producer: name: VU Amsterdam url: https://vu.nl/en +- + user_agent: Searcherxweb + bot: + name: Generic Bot diff --git a/regexes/bots.yml b/regexes/bots.yml index 6e62c330cc..dd9cca5df9 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4653,7 +4653,7 @@ url: 'https://vu.nl/en' # Generic bots -- regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re 
Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherweb|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|sslshed|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|url|Zeus|ZmEu)$' +- regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? 
world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherx?web|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|sslshed|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|url|Zeus|ZmEu)$' name: 'Generic Bot' # Generic detections From fa2db24e40f24d25286da46b54ac4c7992a2eba3 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Thu, 18 Jul 2024 18:11:17 +0200 Subject: [PATCH 29/33] Improves detection for generic bots --- Tests/fixtures/bots.yml | 4 ++++ regexes/bots.yml | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 65986d870f..8ec85d008a 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7988,3 +7988,7 @@ user_agent: Searcherxweb bot: name: Generic Bot +- + user_agent: Mozilla/5.0 (platform; rv:geckoversion) Gecko/geckotrail Firefox/firefoxversion + bot: + name: Generic Bot diff --git a/regexes/bots.yml b/regexes/bots.yml index dd9cca5df9..b7d52250c2 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4653,7 +4653,7 @@ url: 'https://vu.nl/en' # Generic bots -- regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? 
world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherx?web|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|sslshed|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|url|Zeus|ZmEu)$' +- regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherx?web|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|sslshed|geckotrail|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|url|Zeus|ZmEu)$' name: 'Generic Bot' # Generic detections From 08c891a9fefe89894c1b01fa95be0e199dfa0751 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Thu, 18 Jul 2024 18:29:46 +0200 Subject: [PATCH 30/33] Improves detection for generic bots --- Tests/fixtures/bots.yml | 8 ++++++++ regexes/bots.yml | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 8ec85d008a..22f8174a24 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -7992,3 +7992,11 @@ 
user_agent: Mozilla/5.0 (platform; rv:geckoversion) Gecko/geckotrail Firefox/firefoxversion bot: name: Generic Bot +- + user_agent: Report Runner + bot: + name: Generic Bot +- + user_agent: Node.js + bot: + name: Generic Bot diff --git a/regexes/bots.yml b/regexes/bots.yml index b7d52250c2..12907169fa 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4653,7 +4653,7 @@ url: 'https://vu.nl/en' # Generic bots -- regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherx?web|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|sslshed|geckotrail|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|url|Zeus|ZmEu)$' +- regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? 
world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherx?web|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|sslshed|geckotrail|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|Node\.js|Report Runner|url|Zeus|ZmEu)$' name: 'Generic Bot' # Generic detections From c448dec9ec5afe405c01eb397bbf4eff73ea7d94 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Thu, 18 Jul 2024 18:35:47 +0200 Subject: [PATCH 31/33] Adds detection for Functionize --- Tests/fixtures/bots.yml | 9 +++++++++ regexes/bots.yml | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 22f8174a24..93f2dc2977 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -8000,3 +8000,12 @@ user_agent: Node.js bot: name: Generic Bot +- + user_agent: Mozilla/5.0 (X11; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Functionize + bot: + name: Functionize + category: Crawler + url: https://www.functionize.com/ + producer: + name: Functionize, Inc. + url: https://www.functionize.com/ diff --git a/regexes/bots.yml b/regexes/bots.yml index 12907169fa..04b1581999 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4652,6 +4652,14 @@ name: 'VU Amsterdam' url: 'https://vu.nl/en' +- regex: 'Functionize' + name: 'Functionize' + category: 'Crawler' + url: 'https://www.functionize.com/' + producer: + name: 'Functionize, Inc.'
+ url: 'https://www.functionize.com/' + # Generic bots - regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherx?web|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|sslshed|geckotrail|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|Node\.js|Report Runner|url|Zeus|ZmEu)$' name: 'Generic Bot' From c50af03adc349ceaf603707707d32412fde4f0bd Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Thu, 18 Jul 2024 18:46:39 +0200 Subject: [PATCH 32/33] Remove from apps --- Tests/Parser/Client/fixtures/mobile_app.yml | 6 ------ regexes/client/mobile_apps.yml | 5 ----- 2 files changed, 11 deletions(-) diff --git a/Tests/Parser/Client/fixtures/mobile_app.yml b/Tests/Parser/Client/fixtures/mobile_app.yml index 609d74ddfd..5e75d98432 100644 --- a/Tests/Parser/Client/fixtures/mobile_app.yml +++ b/Tests/Parser/Client/fixtures/mobile_app.yml @@ -2057,12 +2057,6 @@ type: mobile app name: Teams version: 24004.1304.2655.7488 -- - user_agent: Report Runner - client: - type: mobile app - name: Report Runner - version: "" - user_agent: Mozilla/5.0 (iPhone; CPU iPhone OS 15_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Zalo iOS/448 ZaloTheme/light ZaloLanguage/en client: diff
--git a/regexes/client/mobile_apps.yml b/regexes/client/mobile_apps.yml index bc813cb053..aa2882159e 100644 --- a/regexes/client/mobile_apps.yml +++ b/regexes/client/mobile_apps.yml @@ -5,11 +5,6 @@ # @license http://www.gnu.org/licenses/lgpl.html LGPL v3 or later ############### -# Report Runner (https://reportrunner.com/) -- regex: 'Report Runner' - name: 'Report Runner' - version: '' - # Ameba (https://ameblo.jp/ | https://play.google.com/store/apps/details?id=jp.ameba) - regex: 'jpameblo;(\d+\.[\.\d]+)' name: 'Ameba' From 703e81b176cb395620243d2b38eb82aa2253aa55 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Fri, 19 Jul 2024 01:47:46 +0200 Subject: [PATCH 33/33] Adds detection for Prerender --- Tests/fixtures/bots.yml | 27 +++++++++++++++++++++++++++ regexes/bots.yml | 8 ++++++++ 2 files changed, 35 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 93f2dc2977..174d437fb1 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -8009,3 +8009,30 @@ producer: name: Functionize, Inc. url: https://www.functionize.com/ +- + user_agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/W.X.Y.Z Safari/537.36 Prerender (+https://github.com/prerender/prerender) + bot: + name: Prerender + category: Crawler + url: https://docs.prerender.io/docs/33-overview-of-prerender-crawlers + producer: + name: saas.group Inc. + url: https://saas.group/ +- + user_agent: Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/W.X.Y.Z Mobile Safari/537.36 Prerender (+https://github.com/prerender/prerender) + bot: + name: Prerender + category: Crawler + url: https://docs.prerender.io/docs/33-overview-of-prerender-crawlers + producer: + name: saas.group Inc.
+ url: https://saas.group/ +- + user_agent: Prerender (+https://github.com/prerender/prerender) + bot: + name: Prerender + category: Crawler + url: https://docs.prerender.io/docs/33-overview-of-prerender-crawlers + producer: + name: saas.group Inc. + url: https://saas.group/ diff --git a/regexes/bots.yml b/regexes/bots.yml index 04b1581999..955c20dbcb 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4660,6 +4660,14 @@ name: 'Functionize, Inc.' url: 'https://www.functionize.com/' +- regex: 'Prerender' + name: 'Prerender' + category: 'Crawler' + url: 'https://docs.prerender.io/docs/33-overview-of-prerender-crawlers' + producer: + name: 'saas.group Inc.' + url: 'https://saas.group/' + # Generic bots - regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherx?web|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|sslshed|geckotrail|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|Node\.js|Report Runner|url|Zeus|ZmEu)$' name: 'Generic Bot'