From f2a8bf14a0a90d4becc54e67a0011c46ec78d6d0 Mon Sep 17 00:00:00 2001 From: "Arthur.Zhang" Date: Mon, 12 Jul 2021 10:04:47 +0800 Subject: [PATCH 01/13] feat(plugin): Add new plugin bot-restriction for bot spider restrction --- apisix/plugins/bot-restriction.lua | 179 ++++++ conf/config-default.yaml | 1 + docs/en/latest/plugins/bot-restriction.md | 130 ++++ docs/zh/latest/plugins/bot-restriction.md | 123 ++++ t/plugin/bot-restriction.t | 699 ++++++++++++++++++++++ 5 files changed, 1132 insertions(+) create mode 100644 apisix/plugins/bot-restriction.lua create mode 100644 docs/en/latest/plugins/bot-restriction.md create mode 100644 docs/zh/latest/plugins/bot-restriction.md create mode 100644 t/plugin/bot-restriction.t diff --git a/apisix/plugins/bot-restriction.lua b/apisix/plugins/bot-restriction.lua new file mode 100644 index 000000000000..42e890bba434 --- /dev/null +++ b/apisix/plugins/bot-restriction.lua @@ -0,0 +1,179 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+-- +local ipairs = ipairs +local core = require("apisix.core") +local stringx = require('pl.stringx') +local str_strip = stringx.strip +local re_find = ngx.re.find + +local MATCH_NONE = 0 +local MATCH_ALLOW = 1 +local MATCH_DENY = 2 +local MATCH_BOT = 3 + +local lrucache_useragent = core.lrucache.new({ ttl = 300, count = 1024 }) + +local schema = { + type = "object", + properties = { + message = { + type = "string", + minLength = 1, + maxLength = 1024, + default = "Not allowed" + }, + whitelist = { + type = "array", + minItems = 1 + }, + blacklist = { + type = "array", + minItems = 1 + }, + }, + additionalProperties = false, +} + +local plugin_name = "bot-restriction" + +local _M = { + version = 0.1, + priority = 3000, + name = plugin_name, + schema = schema, +} + +-- List taken from https://github.com/ua-parser/uap-core/blob/master/regexes.yaml +local well_known_bots = { + [[(Pingdom\.com_bot_version_)(\d+)\.(\d+)]], + [[(facebookexternalhit)/(\d+)\.(\d+)]], + [[Google.{0,50}/\+/web/snippet]], + [[(NewRelicPinger)/(\d+)\.(\d+)]], + [[\b(Boto3?|JetS3t|aws-(?:cli|sdk-(?:cpp|go|java|nodejs|ruby2?|dotnet-(?:\d{1,2}|c]] + .. [[ore)))|s3fs)/(\d+)\.(\d+)(?:\.(\d+)|)]], + [[ PTST/\d+(?:\.)?\d+$]], + [[/((?:Ant-)?Nutch|[A-z]+[Bb]ot|[A-z]+[Ss]pider|Axtaris|fetchurl|Isara|ShopSalad|T]] + .. [[ailsweep)[ \-](\d+)(?:\.(\d+)(?:\.(\d+))?)?]], + [[\b(008|Altresium|Argus|BaiduMobaider|BoardReader|DNSGroup|DataparkSearch|EDI|Goo]] + .. [[dzer|Grub|INGRID|Infohelfer|LinkedInBot|LOOQ|Nutch|OgScrper|PathDefender|Peew|Po]] + .. [[stPost|Steeler|Twitterbot|VSE|WebCrunch|WebZIP|Y!J-BR[A-Z]|YahooSeeker|envolk|sp]] + .. [[roose|wminer)/(\d+)(?:\.(\d+)|)(?:\.(\d+)|)]], + [[(MSIE) (\d+)\.(\d+)([a-z]\d|[a-z]|);.{0,200} MSIECrawler]], + [[(Google-HTTP-Java-Client|Apache-HttpClient|Go-http-client|scalaj-http|http%20cli]] + .. [[ent|Python-urllib|HttpMonitor|TLSProber|WinHTTP|JNLP|okhttp|aihttp|reqwest|axios]] + .. 
[[|unirest-(?:java|python|ruby|nodejs|php|net))(?:[ /](\d+)(?:\.(\d+)|)(?:\.(\d+)|]] + .. [[)|)]], + [[(CSimpleSpider|Cityreview Robot|CrawlDaddy|CrawlFire|Finderbots|Index crawler|Jo]] + .. [[b Roboter|KiwiStatus Spider|Lijit Crawler|QuerySeekerSpider|ScollSpider|Trends C]] + .. [[rawler|USyd-NLP-Spider|SiteCat Webbot|BotName\/\$BotVersion|123metaspider-Bot|14]] + .. [[70\.net crawler|50\.nu|8bo Crawler Bot|Aboundex|Accoona-[A-z]{1,30}-Agent|AdsBot]] + .. [[-Google(?:-[a-z]{1,30}|)|altavista|AppEngine-Google|archive.{0,30}\.org_bot|arch]] + .. [[iver|Ask Jeeves|[Bb]ai[Dd]u[Ss]pider(?:-[A-Za-z]{1,30})(?:-[A-Za-z]{1,30}|)|bing]] + .. [[bot|BingPreview|blitzbot|BlogBridge|Bloglovin|BoardReader Blog Indexer|BoardRead]] + .. [[er Favicon Fetcher|boitho.com-dc|BotSeer|BUbiNG|\b\w{0,30}favicon\w{0,30}\b|\bYe]] + .. [[ti(?:-[a-z]{1,30}|)|Catchpoint(?: bot|)|[Cc]harlotte|Checklinks|clumboot|Comodo ]] + .. [[HTTP\(S\) Crawler|Comodo-Webinspector-Crawler|ConveraCrawler|CRAWL-E|CrawlConver]] + .. [[a|Daumoa(?:-feedfetcher|)|Feed Seeker Bot|Feedbin|findlinks|Flamingo_SearchEngin]] + .. [[e|FollowSite Bot|furlbot|Genieo|gigabot|GomezAgent|gonzo1|(?:[a-zA-Z]{1,30}-|)Go]] + .. [[oglebot(?:-[a-zA-Z]{1,30}|)|Google SketchUp|grub-client|gsa-crawler|heritrix|Hid]] + .. [[denMarket|holmes|HooWWWer|htdig|ia_archiver|ICC-Crawler|Icarus6j|ichiro(?:/mobil]] + .. [[e|)|IconSurf|IlTrovatore(?:-Setaccio|)|InfuzApp|Innovazion Crawler|InternetArchi]] + .. [[ve|IP2[a-z]{1,30}Bot|jbot\b|KaloogaBot|Kraken|Kurzor|larbin|LEIA|LesnikBot|Lingu]] + .. [[ee Bot|LinkAider|LinkedInBot|Lite Bot|Llaut|lycos|Mail\.RU_Bot|masscan|masidani_]] + .. [[bot|Mediapartners-Google|Microsoft .{0,30} Bot|mogimogi|mozDex|MJ12bot|msnbot(?:]] + .. [[-media {0,2}|)|msrbot|Mtps Feed Aggregation System|netresearch|Netvibes|NewsGato]] + .. [[r[^/]{0,30}|^NING|Nutch[^/]{0,30}|Nymesis|ObjectsSearch|OgScrper|Orbiter|OOZBOT|]] + .. [[PagePeeker|PagesInventory|PaxleFramework|Peeplo Screenshot Bot|PlantyNet_WebRobo]] + .. 
[[t|Pompos|Qwantify|Read%20Later|Reaper|RedCarpet|Retreiver|Riddler|Rival IQ|scoot]] + .. [[er|Scrapy|Scrubby|searchsight|seekbot|semanticdiscovery|SemrushBot|Simpy|SimpleP]] + .. [[ie|SEOstats|SimpleRSS|SiteCon|Slackbot-LinkExpanding|Slack-ImgProxy|Slurp|snappy]] + .. [[|Speedy Spider|Squrl Java|Stringer|TheUsefulbot|ThumbShotsBot|Thumbshots\.ru|Tin]] + .. [[y Tiny RSS|Twitterbot|WhatsApp|URL2PNG|Vagabondo|VoilaBot|^vortex|Votay bot|^voy]] + .. [[ager|WASALive.Bot|Web-sniffer|WebThumb|WeSEE:[A-z]{1,30}|WhatWeb|WIRE|WordPress|]] + .. [[Wotbox|www\.almaden\.ibm\.com|Xenu(?:.s|) Link Sleuth|Xerka [A-z]{1,30}Bot|yacy(]] + .. [[?:bot|)|YahooSeeker|Yahoo! Slurp|Yandex\w{1,30}|YodaoBot(?:-[A-z]{1,30}|)|Yottaa]] + .. [[Monitor|Yowedo|^Zao|^Zao-Crawler|ZeBot_www\.ze\.bz|ZooShot|ZyBorg)(?:[ /]v?(\d+)]] + .. [[(?:\.(\d+)(?:\.(\d+)|)|)|)]], + [[(?:\/[A-Za-z0-9\.]+|) {0,5}([A-Za-z0-9 \-_\!\[\]:]{0,50}(?:[Aa]rchiver|[Ii]ndexe]] + .. [[r|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]{0,50}))[/ ](\d+)(?:\.(\d+)(?:\.(\d+)]] + .. [[|)|)]], + [[(?:\/[A-Za-z0-9\.]+|) {0,5}([A-Za-z0-9 \-_\!\[\]:]{0,50}(?:[Aa]rchiver|[Ii]ndexe]] + .. [[r|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]{0,50})) (\d+)(?:\.(\d+)(?:\.(\d+)|)|]] + .. [[)]], + [[((?:[A-z0-9]{1,50}|[A-z\-]{1,50} ?|)(?: the |)(?:[Ss][Pp][Ii][Dd][Ee][Rr]|[Ss]cr]] + .. [[ape|[Cc][Rr][Aa][Ww][Ll])[A-z0-9]{0,50})(?:(?:[ /]| v)(\d+)(?:\.(\d+)|)(?:\.(\d+]] + .. 
[[)|)|)]], +} + +local function match_user_agent(user_agent, conf) -- classify one User-Agent string; returns a MATCH_* constant + user_agent = str_strip(user_agent) + if conf.whitelist then -- whitelist is consulted first, so a hit here wins over every deny rule below + for _, rule in ipairs(conf.whitelist) do + if re_find(user_agent, rule, "jo") then -- each rule is used as a regex; "jo" = PCRE JIT + compile-once cache + return MATCH_ALLOW + end + end + end + + if conf.blacklist then -- user-configured deny rules + for _, rule in ipairs(conf.blacklist) do + if re_find(user_agent, rule, "jo") then + return MATCH_DENY + end + end + end + + for _, rule in ipairs(well_known_bots) do -- built-in list, checked last + if re_find(user_agent, rule, "jo") then + return MATCH_BOT + end + end + + return MATCH_NONE +end + +function _M.check_schema(conf) -- validates conf against `schema`; returns true, or false plus the error + local ok, err = core.schema.check(schema, conf) + + if not ok then + return false, err + end + + return true +end + +function _M.access(conf, _) -- returns 403 with conf.message for MATCH_DENY / MATCH_BOT, otherwise returns nothing + local headers = ngx.req.get_headers() + local user_agent = headers["user-agent"] + -- ignore multiple instances of request headers (NOTE(review): such requests skip every check; confirm intended) + if type(user_agent) == "table" then + return + end + if not user_agent then -- no User-Agent header: nothing to check + return + end + local match, err = lrucache_useragent(user_agent, conf, match_user_agent, user_agent, conf) + if err then -- if the lookup reports an error, return without rejecting + return + end + + if match > MATCH_ALLOW then -- i.e. MATCH_DENY (2) or MATCH_BOT (3) + return 403, { message = conf.message } + end +end + +return _M diff --git a/conf/config-default.yaml b/conf/config-default.yaml index eedf77febdfa..9cbe1963deef 100644 --- a/conf/config-default.yaml +++ b/conf/config-default.yaml @@ -252,6 +252,7 @@ plugins: # plugin list (sorted by priority) - batch-requests # priority: 4010 - cors # priority: 4000 - ip-restriction # priority: 3000 + - bot-restriction # priority: 3000 - referer-restriction # priority: 2990 - uri-blocker # priority: 2900 - request-validation # priority: 2800 diff --git a/docs/en/latest/plugins/bot-restriction.md b/docs/en/latest/plugins/bot-restriction.md new file mode 100644 index 000000000000..c133015beba0 --- /dev/null +++ b/docs/en/latest/plugins/bot-restriction.md @@ -0,0 +1,130 @@ +--- +title: bot-restriction +--- + + + +## Summary + +- [**Name**](#name) +- [**Attributes**](#attributes) +- [**How To 
Enable**](#how-to-enable) +- [**Test Plugin**](#test-plugin) +- [**Disable Plugin**](#disable-plugin) + +## Name + +The `bot-restriction` plugin restricts access to a Service or a Route by `whitelisting` or +`blacklisting` User-Agent patterns, and it also rejects the most well-known bots by default. + +## Attributes + +| Name | Type | Requirement | Default | Valid | Description | +| --------- | ------------- | ----------- | ------- | ----- | ---------------------------------------- | +| whitelist | array[string] | optional | | | List of User-Agent regular expressions to allow. | +| blacklist | array[string] | optional | | | List of User-Agent regular expressions to reject. | +| message | string | optional | Not allowed | [1, 1024] | Message returned when a request is rejected. | + +Both `whitelist` and `blacklist` are optional and can be used together. A User-Agent is checked in this order: +whitelist -> blacklist -> built-in list of well-known bot User-Agents. + +The message can be user-defined. + +## How To Enable + +Create a route or service object and enable the `bot-restriction` plugin. + +```shell +curl http://127.0.0.1:9080/apisix/admin/routes/1 -H 'X-API-KEY: edd1c9f034335f136f87ad84b625c8f1' -X PUT -d ' +{ + "uri": "/index.html", + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + }, + "plugins": { + "bot-restriction": { + "whitelist": [ + "my-bot1", + "(Baiduspider)/(\\d+)\\.(\\d+)" + ], + "blacklist": [ + "my-bot2", + "(Twitterspider)/(\\d+)\\.(\\d+)" + ] + } + } +}' +``` + +By default, `{"message":"Not allowed"}` is returned when a request is rejected. If you want to use a custom message, you can configure it in the plugin section. + +```json +"plugins": { + "bot-restriction": { + "blacklist": [ + "my-bot2", + "(Twitterspider)/(\\d+)\\.(\\d+)" + ], + "message": "Do you want to do something bad?" + } +} +``` + +## Test Plugin + +Requests from normal User-Agent: + +```shell +$ curl http://127.0.0.1:9080/index.html -i +HTTP/1.1 200 OK +... 
+``` + +Requests from bot User-Agent: + +```shell +$ curl http://127.0.0.1:9080/index.html --header 'User-Agent: Twitterspider/2.0' +HTTP/1.1 403 Forbidden +``` + +## Disable Plugin + +To disable the `bot-restriction` plugin, simply delete its JSON configuration from the +`plugins` section of the route. The change takes effect immediately, +without restarting the service: + +```shell +$ curl http://127.0.0.1:9080/apisix/admin/routes/1 -H 'X-API-KEY: edd1c9f034335f136f87ad84b625c8f1' -X PUT -d ' +{ + "uri": "/index.html", + "plugins": {}, + "upstream": { + "type": "roundrobin", + "nodes": { + "39.97.63.215:80": 1 + } + } +}' +``` + +The `bot-restriction` plugin has been disabled now. It works for other plugins. \ No newline at end of file diff --git a/docs/zh/latest/plugins/bot-restriction.md b/docs/zh/latest/plugins/bot-restriction.md new file mode 100644 index 000000000000..b0b2c601f365 --- /dev/null +++ b/docs/zh/latest/plugins/bot-restriction.md @@ -0,0 +1,123 @@ +--- +title: bot-restriction +--- + + + +## 目录 + +- [**名字**](#名字) +- [**属性**](#属性) +- [**如何启用**](#如何启用) +- [**测试插件**](#测试插件) +- [**禁用插件**](#禁用插件) + +## 名字 + +`bot-restriction` 可以通过将指定 User-Agent 列入白名单或黑名单的方式来限制对服务或接口的访问,同时此插件也会对常见的爬虫 User-Agent 进行检查。 + +## 属性 + +| 参数名 | 类型 | 可选项 | 默认值 | 有效值 | 描述 | +| --------- | ------------- | ------ | ------ | ------ | -------------------------------- | +| whitelist | array[string] | 可选 | | | 加入白名单的 User-Agent | +| blacklist | array[string] | 可选 | | | 加入黑名单的 User-Agent | +| message | string | 可选 | Not allowed 
| [1, 1024] | 在未允许的 User-Agent 访问的情况下返回的信息 | + +白名单和黑名单可以同时启用,此插件对 User-Agent 的检查先后顺序依次如下:白名单、黑名单、内置的常见爬虫 User-Agent。 `message` 可以由用户自定义。 + +## 如何启用 + +下面是一个示例,在指定的 route 上开启了 `bot-restriction` 插件: + +```shell +curl http://127.0.0.1:9080/apisix/admin/routes/1 -H 'X-API-KEY: edd1c9f034335f136f87ad84b625c8f1' -X PUT -d ' +{ + "uri": "/index.html", + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + }, + "plugins": { + "bot-restriction": { + "whitelist": [ + "my-bot1", + "(Baiduspider)/(\\d+)\\.(\\d+)" + ], + "blacklist": [ + "my-bot2", + "(Twitterspider)/(\\d+)\\.(\\d+)" + ] + } + } +}' +``` + +当未允许的 User-Agent 访问时,默认返回 `{"message":"Not allowed"}`。如果你想使用自定义的 `message`,可以在插件部分进行配置: + +```json +"plugins": { + "bot-restriction": { + "blacklist": [ + "my-bot2", + "(Twitterspider)/(\\d+)\\.(\\d+)" + ], + "message": "Do you want to do something bad?" + } +} +``` + +## 测试插件 + +通过正常的 UA 访问: + +```shell +$ curl http://127.0.0.1:9080/index.html --header 'User-Agent: YourApp/2.0.0' +HTTP/1.1 200 OK +``` + +通过爬虫 User-Agent 访问: + +```shell +$ curl http://127.0.0.1:9080/index.html --header 'User-Agent: Twitterspider/2.0' +HTTP/1.1 403 Forbidden +``` + +## 禁用插件 + +当你想去掉 `bot-restriction` 插件的时候,很简单,在插件的配置中把对应的 json 配置删除即可,无须重启服务,即刻生效: + +```shell +$ curl http://127.0.0.1:9080/apisix/admin/routes/1 -H 'X-API-KEY: edd1c9f034335f136f87ad84b625c8f1' -X PUT -d ' +{ + "uri": "/index.html", + "plugins": {}, + "upstream": { + "type": "roundrobin", + "nodes": { + "39.97.63.215:80": 1 + } + } +}' +``` + +现在就已移除 `bot-restriction` 插件,其它插件的开启和移除也类似。 diff --git a/t/plugin/bot-restriction.t b/t/plugin/bot-restriction.t new file mode 100644 index 000000000000..118998f2d3ea --- /dev/null +++ b/t/plugin/bot-restriction.t @@ -0,0 +1,699 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +use t::APISIX 'no_plan'; + +repeat_each(1); +no_long_string(); +no_shuffle(); +no_root_location(); + +add_block_preprocessor(sub { + my ($block) = @_; + + if (!$block->request) { + $block->set_value("request", "GET /t"); + } + + if (!$block->no_error_log && !$block->error_log) { + $block->set_value("no_error_log", "[error]\n[alert]"); + } +}); + +run_tests; + +__DATA__ + +=== TEST 1: empty conf +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.bot-restriction") + local ok, err = plugin.check_schema({}) + if not ok then + ngx.say(err) + end + + ngx.say(require("toolkit.json").encode(conf)) + } + } +--- error_code: 200 + + + +=== TEST 2: set whitelist, blacklist and user-defined message +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.bot-restriction") + local conf = { + whitelist = { + "my-bot1", + "my-bot2" + }, + blacklist = { + "my-bot1", + "my-bot2" + }, + message = "User-Agent Forbidden", + } + local ok, err = plugin.check_schema(conf) + if not ok then + ngx.say(err) + end + + ngx.say(require("toolkit.json").encode(conf)) + } + } +--- response_body +{"blacklist":["my-bot1","my-bot2"],"message":"User-Agent Forbidden","whitelist":["my-bot1","my-bot2"]} + + + +=== TEST 3: whitelist not array +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.bot-restriction") + 
local conf = { + whitelist = "my-bot1", + } + local ok, err = plugin.check_schema(conf) + if not ok then + ngx.say(err) + end + + ngx.say("done") + } + } +--- response_body +property "whitelist" validation failed: wrong type: expected array, got string +done + + + +=== TEST 4: blacklist not array +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.bot-restriction") + local conf = { + blacklist = 100, + } + local ok, err = plugin.check_schema(conf) + if not ok then + ngx.say(err) + end + + ngx.say("done") + } + } +--- response_body +property "blacklist" validation failed: wrong type: expected array, got number +done + + + +=== TEST 5: message not string +--- config + location /t { + content_by_lua_block { + local plugin = require("apisix.plugins.bot-restriction") + local conf = { + message = 100, + } + local ok, err = plugin.check_schema(conf) + if not ok then + ngx.say(err) + end + + ngx.say("done") + } + } +--- response_body +property "message" validation failed: wrong type: expected string, got number +done + + + +=== TEST 6: set blacklist + +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/hello", + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + }, + "plugins": { + "bot-restriction": { + "blacklist": [ + "my-bot1", + "(Baiduspider)/(\\d+)\\.(\\d+)" + ] + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 7: hit route and user-agent in blacklist + +--- request +GET /hello +--- more_headers +User-Agent:my-bot1 +--- error_code: 403 + + + +=== TEST 8: hit route and user-agent in blacklist with multiple + +--- request +GET /hello +--- more_headers +User-Agent:my-bot1 +User-Agent:my-bot1 +--- error_code: 200 + + + +=== TEST 9: hit route and user-agent match blacklist regex + +--- request 
+GET /hello +--- more_headers +User-Agent:Baiduspider/3.0 +--- error_code: 403 + + + +=== TEST 10: hit route and user-agent not in blacklist + +--- request +GET /hello +--- more_headers +User-Agent:foo/bar +--- error_code: 200 + + + +=== TEST 11: set whitelist + +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/hello", + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + }, + "plugins": { + "bot-restriction": { + "whitelist": [ + "my-bot1", + "(Baiduspider)/(\\d+)\\.(\\d+)" + ] + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 12: hit route and user-agent in whitelist + +--- request +GET /hello +--- more_headers +User-Agent:my-bot1 +--- error_code: 200 + + + +=== TEST 13: hit route and user-agent match whitelist regex + +--- request +GET /hello +--- more_headers +User-Agent:Baiduspider/3.0 +--- error_code: 200 + + + +=== TEST 14: hit route and user-agent not in whitelist + +--- request +GET /hello +--- more_headers +User-Agent:foo/bar +--- error_code: 200 + + + +=== TEST 15: set rules to default +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/hello", + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + }, + "plugins": { + "bot-restriction": { + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 16: hit route and user-agent in default list + +--- request +GET /hello +--- more_headers +User-Agent:Twitterbot/1.0 +--- error_code: 403 + + + +=== TEST 17: set config: user-agent in both whitelist and blacklist +--- config + location /t { + content_by_lua_block { + local t = 
require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/hello", + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + }, + "plugins": { + "bot-restriction": { + "whitelist": [ + "foo/bar", + "(Baiduspider)/(\\d+)\\.(\\d+)" + ], + "blacklist": [ + "foo/bar", + "(Baiduspider)/(\\d+)\\.(\\d+)" + ] + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 18: hit route and user-agent in both whitelist and blacklist, part 1 + +--- request +GET /hello +--- more_headers +User-Agent:foo/bar +--- error_code: 200 + + + +=== TEST 19: hit route and user-agent in both whitelist and blacklist, part 2 + +--- request +GET /hello +--- more_headers +User-Agent:Baiduspider/1.0 +--- error_code: 200 + + + +=== TEST 20: set config: user-agent in both whitelist and default deny list +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/hello", + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + }, + "plugins": { + "bot-restriction": { + "whitelist": [ + "(Baiduspider)/(\\d+)\\.(\\d+)" + ] + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 21: hit route and user-agent in both whitelist and default deny list + +--- request +GET /hello +--- more_headers +User-Agent:Baiduspider/1.0 +--- error_code: 200 + + + +=== TEST 22: message that do not reach the minimum range +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/hello", + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + }, + "plugins": { + "bot-restriction": { + "message": "" + } + } + }]] + ) 
+ + ngx.say(body) + } + } +--- response_body_like eval +qr/string too short, expected at least 1, got 0/ + + + +=== TEST 23: exceeds the maximum limit of message +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local json = require("toolkit.json") + + local data = { + uri = "/hello", + upstream = { + type = "roundrobin", + nodes = { + ["127.0.0.1:1980"] = 1, + } + }, + plugins = { + ["bot-restriction"] = { + message = ("-1Aa#"):rep(205) + } + } + } + + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + json.encode(data) + ) + + ngx.say(body) + } + } +--- response_body_like eval +qr/string too long, expected at most 1024, got 1025/ + + + +=== TEST 24: set custom message +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/hello", + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + }, + "plugins": { + "bot-restriction": { + "message": "Do you want to do something bad?" 
+ } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } + +--- response_body +passed + + + +=== TEST 25: test custom message +--- request +GET /hello +--- more_headers +User-Agent:Twitterbot/1.0 +--- error_code: 403 +--- response_body +{"message":"Do you want to do something bad?"} + + + +=== TEST 26: test remove bot-restriction part 1 +--- config + location /enable { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/hello", + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + }, + "plugins": { + "bot-restriction": { + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } + + location /disable { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/hello", + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + }, + "plugins": { + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- request +GET /enable +--- error_code: 200 + + + +=== TEST 27: test remove bot-restriction part 2 +--- request +GET /hello +--- more_headers +User-Agent:Twitterbot/1.0 +--- error_code: 403 + + + +=== TEST 28: test remove bot-restriction part 3, remove plugin +--- config + location /disable { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/hello", + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + }, + "plugins": { + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- request +GET /disable +--- error_code: 200 + + + +=== TEST 29: test remove bot-restriction part 4, check bot User-Agent +--- request +GET /hello +--- more_headers +User-Agent:Twitterbot/1.0 +--- 
response_body +hello world + + + +=== TEST 30: set disable=true +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/hello", + "plugins": { + "bot-restriction": { + "blacklist": [ + "foo" + ], + "disable": true + } + } + }]] + ) + + if code >= 300 then + ngx.status = code + end + ngx.say(body) + } + } +--- response_body +passed \ No newline at end of file From 830b621e45e5be65232389f752d2bd800d24bc09 Mon Sep 17 00:00:00 2001 From: "Arthur.Zhang" Date: Mon, 12 Jul 2021 13:04:13 +0800 Subject: [PATCH 02/13] fix doc --- docs/en/latest/plugins/bot-restriction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/latest/plugins/bot-restriction.md b/docs/en/latest/plugins/bot-restriction.md index c133015beba0..7b6fb3bae6ef 100644 --- a/docs/en/latest/plugins/bot-restriction.md +++ b/docs/en/latest/plugins/bot-restriction.md @@ -127,4 +127,4 @@ $ curl http://127.0.0.1:2379/v2/keys/apisix/routes/1 -H 'X-API-KEY: edd1c9f03433 }' ``` -The `bot-restriction` plugin has been disabled now. It works for other plugins. \ No newline at end of file +The `bot-restriction` plugin has been disabled now. It works for other plugins. 
From bfe2d7ccc53d6b91b876bb64b3ccc8fa707bc20c Mon Sep 17 00:00:00 2001 From: "Arthur.Zhang" Date: Mon, 12 Jul 2021 13:07:21 +0800 Subject: [PATCH 03/13] fix doc --- docs/en/latest/config.json | 1 + docs/zh/latest/config.json | 1 + 2 files changed, 2 insertions(+) diff --git a/docs/en/latest/config.json b/docs/en/latest/config.json index 2f088781efff..d1b123a48014 100644 --- a/docs/en/latest/config.json +++ b/docs/en/latest/config.json @@ -73,6 +73,7 @@ "plugins/cors", "plugins/uri-blocker", "plugins/ip-restriction", + "plugins/bot-restriction", "plugins/referer-restriction", "plugins/consumer-restriction" ] diff --git a/docs/zh/latest/config.json b/docs/zh/latest/config.json index 9cda9ea311b5..5a84f7a9b024 100644 --- a/docs/zh/latest/config.json +++ b/docs/zh/latest/config.json @@ -71,6 +71,7 @@ "plugins/cors", "plugins/uri-blocker", "plugins/ip-restriction", + "plugins/bot-restriction", "plugins/referer-restriction", "plugins/consumer-restriction" ] From cb6d5d800ed43e0a8fb191531a9cde4ff6e82619 Mon Sep 17 00:00:00 2001 From: "Arthur.Zhang" Date: Mon, 12 Jul 2021 14:26:43 +0800 Subject: [PATCH 04/13] Fix lint --- apisix/plugins/bot-restriction.lua | 14 +++++++------- t/admin/plugins.t | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/apisix/plugins/bot-restriction.lua b/apisix/plugins/bot-restriction.lua index 42e890bba434..249181b69f9d 100644 --- a/apisix/plugins/bot-restriction.lua +++ b/apisix/plugins/bot-restriction.lua @@ -52,7 +52,7 @@ local plugin_name = "bot-restriction" local _M = { version = 0.1, - priority = 3000, + priority = 2999, name = plugin_name, schema = schema, } @@ -156,14 +156,14 @@ function _M.check_schema(conf) return true end -function _M.access(conf, _) - local headers = ngx.req.get_headers() - local user_agent = headers["user-agent"] - -- ignore multiple instances of request headers - if type(user_agent) == "table" then +function _M.access(conf, ctx) + local user_agent = core.request.header(ctx, "User-Agent") + + if not 
user_agent then return end - if not user_agent then + -- ignore multiple instances of request headers + if type(user_agent) == "table" then return end local match, err = lrucache_useragent(user_agent, conf, match_user_agent, user_agent, conf) diff --git a/t/admin/plugins.t b/t/admin/plugins.t index 61553a35e343..cd279e635a94 100644 --- a/t/admin/plugins.t +++ b/t/admin/plugins.t @@ -40,7 +40,7 @@ __DATA__ --- request GET /apisix/admin/plugins/list --- response_body_like eval -qr/\["client-control","ext-plugin-pre-req","zipkin","request-id","fault-injection","serverless-pre-function","batch-requests","cors","ip-restriction","referer-restriction","uri-blocker","request-validation","openid-connect","wolf-rbac","hmac-auth","basic-auth","jwt-auth","key-auth","consumer-restriction","authz-keycloak","proxy-mirror","proxy-cache","proxy-rewrite","api-breaker","limit-conn","limit-count","limit-req","server-info","traffic-split","redirect","response-rewrite","grpc-transcode","prometheus","echo","http-logger","sls-logger","tcp-logger","kafka-logger","syslog","udp-logger","example-plugin","serverless-post-function","ext-plugin-post-req"\]/ +qr/\["client-control","ext-plugin-pre-req","zipkin","request-id","fault-injection","serverless-pre-function","batch-requests","cors","ip-restriction","bot-restriction","referer-restriction","uri-blocker","request-validation","openid-connect","wolf-rbac","hmac-auth","basic-auth","jwt-auth","key-auth","consumer-restriction","authz-keycloak","proxy-mirror","proxy-cache","proxy-rewrite","api-breaker","limit-conn","limit-count","limit-req","server-info","traffic-split","redirect","response-rewrite","grpc-transcode","prometheus","echo","http-logger","sls-logger","tcp-logger","kafka-logger","syslog","udp-logger","example-plugin","serverless-post-function","ext-plugin-post-req"\]/ --- no_error_log [error] From 8be944ea13a3bdd34e7f271dc30ec8ae3998bfb1 Mon Sep 17 00:00:00 2001 From: "Arthur.Zhang" Date: Mon, 12 Jul 2021 14:30:11 +0800 Subject: [PATCH 
05/13] fix lint --- apisix/plugins/bot-restriction.lua | 1 + 1 file changed, 1 insertion(+) diff --git a/apisix/plugins/bot-restriction.lua b/apisix/plugins/bot-restriction.lua index 249181b69f9d..6b8d63719e94 100644 --- a/apisix/plugins/bot-restriction.lua +++ b/apisix/plugins/bot-restriction.lua @@ -17,6 +17,7 @@ local ipairs = ipairs local core = require("apisix.core") local stringx = require('pl.stringx') +local type = type local str_strip = stringx.strip local re_find = ngx.re.find From dc331cdf2f731f5546cedb6a47d29a5729b7ae37 Mon Sep 17 00:00:00 2001 From: "Arthur.Zhang" Date: Mon, 12 Jul 2021 14:37:15 +0800 Subject: [PATCH 06/13] Fix lint --- t/plugin/bot-restriction.t | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t/plugin/bot-restriction.t b/t/plugin/bot-restriction.t index 118998f2d3ea..cfae1fbf517f 100644 --- a/t/plugin/bot-restriction.t +++ b/t/plugin/bot-restriction.t @@ -696,4 +696,4 @@ hello world } } --- response_body -passed \ No newline at end of file +passed From 7ea7c8729e348ee9ec3682535955989606196a50 Mon Sep 17 00:00:00 2001 From: "Arthur.Zhang" Date: Mon, 12 Jul 2021 16:52:44 +0800 Subject: [PATCH 07/13] Fix Review --- apisix/plugins/bot-restriction.lua | 5 +---- conf/config-default.yaml | 2 +- t/plugin/bot-restriction.t | 26 -------------------------- 3 files changed, 2 insertions(+), 31 deletions(-) diff --git a/apisix/plugins/bot-restriction.lua b/apisix/plugins/bot-restriction.lua index 6b8d63719e94..b01374bc915c 100644 --- a/apisix/plugins/bot-restriction.lua +++ b/apisix/plugins/bot-restriction.lua @@ -167,10 +167,7 @@ function _M.access(conf, ctx) if type(user_agent) == "table" then return end - local match, err = lrucache_useragent(user_agent, conf, match_user_agent, user_agent, conf) - if err then - return - end + local match = lrucache_useragent(user_agent, conf, match_user_agent, user_agent, conf) if match > MATCH_ALLOW then return 403, { message = conf.message } diff --git a/conf/config-default.yaml 
b/conf/config-default.yaml index 9cbe1963deef..19969dc78e4a 100644 --- a/conf/config-default.yaml +++ b/conf/config-default.yaml @@ -252,7 +252,7 @@ plugins: # plugin list (sorted by priority) - batch-requests # priority: 4010 - cors # priority: 4000 - ip-restriction # priority: 3000 - - bot-restriction # priority: 3000 + - bot-restriction # priority: 2999 - referer-restriction # priority: 2990 - uri-blocker # priority: 2900 - request-validation # priority: 2800 diff --git a/t/plugin/bot-restriction.t b/t/plugin/bot-restriction.t index cfae1fbf517f..0f6712cc9a27 100644 --- a/t/plugin/bot-restriction.t +++ b/t/plugin/bot-restriction.t @@ -151,7 +151,6 @@ done === TEST 6: set blacklist - --- config location /t { content_by_lua_block { @@ -587,31 +586,6 @@ User-Agent:Twitterbot/1.0 ngx.say(body) } } - - location /disable { - content_by_lua_block { - local t = require("lib.test_admin").test - local code, body = t('/apisix/admin/routes/1', - ngx.HTTP_PUT, - [[{ - "uri": "/hello", - "upstream": { - "type": "roundrobin", - "nodes": { - "127.0.0.1:1980": 1 - } - }, - "plugins": { - } - }]] - ) - - if code >= 300 then - ngx.status = code - end - ngx.say(body) - } - } --- request GET /enable --- error_code: 200 From 0c6dcb2d5bde3d5aad8cf22b1df74e21626d6e61 Mon Sep 17 00:00:00 2001 From: "Arthur.Zhang" Date: Mon, 12 Jul 2021 18:49:00 +0800 Subject: [PATCH 08/13] rename plugin to ua-restriction --- apisix/plugins/bot-restriction.lua | 177 -------------- apisix/plugins/ua-restriction.lua | 121 ++++++++++ conf/config-default.yaml | 2 +- docs/en/latest/config.json | 2 +- .../{bot-restriction.md => ua-restriction.md} | 28 +-- docs/zh/latest/config.json | 2 +- .../{bot-restriction.md => ua-restriction.md} | 26 +- t/admin/plugins.t | 2 +- .../{bot-restriction.t => ua-restriction.t} | 223 ++++++------------ 9 files changed, 218 insertions(+), 365 deletions(-) delete mode 100644 apisix/plugins/bot-restriction.lua create mode 100644 apisix/plugins/ua-restriction.lua rename 
docs/en/latest/plugins/{bot-restriction.md => ua-restriction.md} (77%) rename docs/zh/latest/plugins/{bot-restriction.md => ua-restriction.md} (74%) rename t/plugin/{bot-restriction.t => ua-restriction.t} (70%) diff --git a/apisix/plugins/bot-restriction.lua b/apisix/plugins/bot-restriction.lua deleted file mode 100644 index b01374bc915c..000000000000 --- a/apisix/plugins/bot-restriction.lua +++ /dev/null @@ -1,177 +0,0 @@ --- --- Licensed to the Apache Software Foundation (ASF) under one or more --- contributor license agreements. See the NOTICE file distributed with --- this work for additional information regarding copyright ownership. --- The ASF licenses this file to You under the Apache License, Version 2.0 --- (the "License"); you may not use this file except in compliance with --- the License. You may obtain a copy of the License at --- --- http://www.apache.org/licenses/LICENSE-2.0 --- --- Unless required by applicable law or agreed to in writing, software --- distributed under the License is distributed on an "AS IS" BASIS, --- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. --- See the License for the specific language governing permissions and --- limitations under the License. 
--- -local ipairs = ipairs -local core = require("apisix.core") -local stringx = require('pl.stringx') -local type = type -local str_strip = stringx.strip -local re_find = ngx.re.find - -local MATCH_NONE = 0 -local MATCH_ALLOW = 1 -local MATCH_DENY = 2 -local MATCH_BOT = 3 - -local lrucache_useragent = core.lrucache.new({ ttl = 300, count = 1024 }) - -local schema = { - type = "object", - properties = { - message = { - type = "string", - minLength = 1, - maxLength = 1024, - default = "Not allowed" - }, - whitelist = { - type = "array", - minItems = 1 - }, - blacklist = { - type = "array", - minItems = 1 - }, - }, - additionalProperties = false, -} - -local plugin_name = "bot-restriction" - -local _M = { - version = 0.1, - priority = 2999, - name = plugin_name, - schema = schema, -} - --- List taken from https://github.com/ua-parser/uap-core/blob/master/regexes.yaml -local well_known_bots = { - [[(Pingdom\.com_bot_version_)(\d+)\.(\d+)]], - [[(facebookexternalhit)/(\d+)\.(\d+)]], - [[Google.{0,50}/\+/web/snippet]], - [[(NewRelicPinger)/(\d+)\.(\d+)], - [[\b(Boto3?|JetS3t|aws-(?:cli|sdk-(?:cpp|go|java|nodejs|ruby2?|dotnet-(?:\d{1,2}|c]] - .. [[ore)))|s3fs)/(\d+)\.(\d+)(?:\.(\d+)|)]], - [[ PTST/\d+(?:\.)?\d+$]], - [[/((?:Ant-)?Nutch|[A-z]+[Bb]ot|[A-z]+[Ss]pider|Axtaris|fetchurl|Isara|ShopSalad|T]] - .. [[ailsweep)[ \-](\d+)(?:\.(\d+)(?:\.(\d+))?)?]], - [[\b(008|Altresium|Argus|BaiduMobaider|BoardReader|DNSGroup|DataparkSearch|EDI|Goo]] - .. [[dzer|Grub|INGRID|Infohelfer|LinkedInBot|LOOQ|Nutch|OgScrper|PathDefender|Peew|Po]] - .. [[stPost|Steeler|Twitterbot|VSE|WebCrunch|WebZIP|Y!J-BR[A-Z]|YahooSeeker|envolk|sp]] - .. [[roose|wminer)/(\d+)(?:\.(\d+)|)(?:\.(\d+)|)]], - [[(MSIE) (\d+)\.(\d+)([a-z]\d|[a-z]|);.{0,200} MSIECrawler]], - [[(Google-HTTP-Java-Client|Apache-HttpClient|Go-http-client|scalaj-http|http%20cli]] - .. [[ent|Python-urllib|HttpMonitor|TLSProber|WinHTTP|JNLP|okhttp|aihttp|reqwest|axios]] - .. 
[[|unirest-(?:java|python|ruby|nodejs|php|net))(?:[ /](\d+)(?:\.(\d+)|)(?:\.(\d+)|]] - .. [[)|)]], - [[(CSimpleSpider|Cityreview Robot|CrawlDaddy|CrawlFire|Finderbots|Index crawler|Jo]] - .. [[b Roboter|KiwiStatus Spider|Lijit Crawler|QuerySeekerSpider|ScollSpider|Trends C]] - .. [[rawler|USyd-NLP-Spider|SiteCat Webbot|BotName\/\$BotVersion|123metaspider-Bot|14]] - .. [[70\.net crawler|50\.nu|8bo Crawler Bot|Aboundex|Accoona-[A-z]{1,30}-Agent|AdsBot]] - .. [[-Google(?:-[a-z]{1,30}|)|altavista|AppEngine-Google|archive.{0,30}\.org_bot|arch]] - .. [[iver|Ask Jeeves|[Bb]ai[Dd]u[Ss]pider(?:-[A-Za-z]{1,30})(?:-[A-Za-z]{1,30}|)|bing]] - .. [[bot|BingPreview|blitzbot|BlogBridge|Bloglovin|BoardReader Blog Indexer|BoardRead]] - .. [[er Favicon Fetcher|boitho.com-dc|BotSeer|BUbiNG|\b\w{0,30}favicon\w{0,30}\b|\bYe]] - .. [[ti(?:-[a-z]{1,30}|)|Catchpoint(?: bot|)|[Cc]harlotte|Checklinks|clumboot|Comodo ]] - .. [[HTTP\(S\) Crawler|Comodo-Webinspector-Crawler|ConveraCrawler|CRAWL-E|CrawlConver]] - .. [[a|Daumoa(?:-feedfetcher|)|Feed Seeker Bot|Feedbin|findlinks|Flamingo_SearchEngin]] - .. [[e|FollowSite Bot|furlbot|Genieo|gigabot|GomezAgent|gonzo1|(?:[a-zA-Z]{1,30}-|)Go]] - .. [[oglebot(?:-[a-zA-Z]{1,30}|)|Google SketchUp|grub-client|gsa-crawler|heritrix|Hid]] - .. [[denMarket|holmes|HooWWWer|htdig|ia_archiver|ICC-Crawler|Icarus6j|ichiro(?:/mobil]] - .. [[e|)|IconSurf|IlTrovatore(?:-Setaccio|)|InfuzApp|Innovazion Crawler|InternetArchi]] - .. [[ve|IP2[a-z]{1,30}Bot|jbot\b|KaloogaBot|Kraken|Kurzor|larbin|LEIA|LesnikBot|Lingu]] - .. [[ee Bot|LinkAider|LinkedInBot|Lite Bot|Llaut|lycos|Mail\.RU_Bot|masscan|masidani_]] - .. [[bot|Mediapartners-Google|Microsoft .{0,30} Bot|mogimogi|mozDex|MJ12bot|msnbot(?:]] - .. [[-media {0,2}|)|msrbot|Mtps Feed Aggregation System|netresearch|Netvibes|NewsGato]] - .. [[r[^/]{0,30}|^NING|Nutch[^/]{0,30}|Nymesis|ObjectsSearch|OgScrper|Orbiter|OOZBOT|]] - .. [[PagePeeker|PagesInventory|PaxleFramework|Peeplo Screenshot Bot|PlantyNet_WebRobo]] - .. 
[[t|Pompos|Qwantify|Read%20Later|Reaper|RedCarpet|Retreiver|Riddler|Rival IQ|scoot]] - .. [[er|Scrapy|Scrubby|searchsight|seekbot|semanticdiscovery|SemrushBot|Simpy|SimpleP]] - .. [[ie|SEOstats|SimpleRSS|SiteCon|Slackbot-LinkExpanding|Slack-ImgProxy|Slurp|snappy]] - .. [[|Speedy Spider|Squrl Java|Stringer|TheUsefulbot|ThumbShotsBot|Thumbshots\.ru|Tin]] - .. [[y Tiny RSS|Twitterbot|WhatsApp|URL2PNG|Vagabondo|VoilaBot|^vortex|Votay bot|^voy]] - .. [[ager|WASALive.Bot|Web-sniffer|WebThumb|WeSEE:[A-z]{1,30}|WhatWeb|WIRE|WordPress|]] - .. [[Wotbox|www\.almaden\.ibm\.com|Xenu(?:.s|) Link Sleuth|Xerka [A-z]{1,30}Bot|yacy(]] - .. [[?:bot|)|YahooSeeker|Yahoo! Slurp|Yandex\w{1,30}|YodaoBot(?:-[A-z]{1,30}|)|Yottaa]] - .. [[Monitor|Yowedo|^Zao|^Zao-Crawler|ZeBot_www\.ze\.bz|ZooShot|ZyBorg)(?:[ /]v?(\d+)]] - .. [[(?:\.(\d+)(?:\.(\d+)|)|)|)]], - [[(?:\/[A-Za-z0-9\.]+|) {0,5}([A-Za-z0-9 \-_\!\[\]:]{0,50}(?:[Aa]rchiver|[Ii]ndexe]] - .. [[r|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]{0,50}))[/ ](\d+)(?:\.(\d+)(?:\.(\d+)]] - .. [[|)|)]], - [[(?:\/[A-Za-z0-9\.]+|) {0,5}([A-Za-z0-9 \-_\!\[\]:]{0,50}(?:[Aa]rchiver|[Ii]ndexe]] - .. [[r|[Ss]craper|[Bb]ot|[Ss]pider|[Cc]rawl[a-z]{0,50})) (\d+)(?:\.(\d+)(?:\.(\d+)|)|]] - .. [[)]], - [[((?:[A-z0-9]{1,50}|[A-z\-]{1,50} ?|)(?: the |)(?:[Ss][Pp][Ii][Dd][Ee][Rr]|[Ss]cr]] - .. [[ape|[Cc][Rr][Aa][Ww][Ll])[A-z0-9]{0,50})(?:(?:[ /]| v)(\d+)(?:\.(\d+)|)(?:\.(\d+]] - .. 
[[)|)|)]], -} - -local function match_user_agent(user_agent, conf) - user_agent = str_strip(user_agent) - if conf.whitelist then - for _, rule in ipairs(conf.whitelist) do - if re_find(user_agent, rule, "jo") then - return MATCH_ALLOW - end - end - end - - if conf.blacklist then - for _, rule in ipairs(conf.blacklist) do - if re_find(user_agent, rule, "jo") then - return MATCH_DENY - end - end - end - - for _, rule in ipairs(well_known_bots) do - if re_find(user_agent, rule, "jo") then - return MATCH_BOT - end - end - - return MATCH_NONE -end - -function _M.check_schema(conf) - local ok, err = core.schema.check(schema, conf) - - if not ok then - return false, err - end - - return true -end - -function _M.access(conf, ctx) - local user_agent = core.request.header(ctx, "User-Agent") - - if not user_agent then - return - end - -- ignore multiple instances of request headers - if type(user_agent) == "table" then - return - end - local match = lrucache_useragent(user_agent, conf, match_user_agent, user_agent, conf) - - if match > MATCH_ALLOW then - return 403, { message = conf.message } - end -end - -return _M diff --git a/apisix/plugins/ua-restriction.lua b/apisix/plugins/ua-restriction.lua new file mode 100644 index 000000000000..eaba09ff7375 --- /dev/null +++ b/apisix/plugins/ua-restriction.lua @@ -0,0 +1,121 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- +local ipairs = ipairs +local core = require("apisix.core") +local stringx = require('pl.stringx') +local type = type +local str_strip = stringx.strip +local re_find = ngx.re.find + +local MATCH_NONE = 0 +local MATCH_ALLOW = 1 +local MATCH_DENY = 2 + +local lrucache_useragent = core.lrucache.new({ ttl = 300, count = 4096 }) + +local schema = { + type = "object", + properties = { + message = { + type = "string", + minLength = 1, + maxLength = 1024, + default = "Not allowed" + }, + allowlist = { + type = "array", + minItems = 1 + }, + denylist = { + type = "array", + minItems = 1 + }, + }, + anyOf = { + {required = {"allowlist"}}, + {required = {"denylist"}}, + }, + minProperties = 1, + additionalProperties = false, +} + +local plugin_name = "ua-restriction" + +local _M = { + version = 0.1, + priority = 2999, + name = plugin_name, + schema = schema, +} + +local function match_user_agent(user_agent, conf) + user_agent = str_strip(user_agent) + if conf.allowlist then + for _, rule in ipairs(conf.allowlist) do + if re_find(user_agent, rule, "jo") then + return MATCH_ALLOW + end + end + end + + if conf.denylist then + for _, rule in ipairs(conf.denylist) do + if re_find(user_agent, rule, "jo") then + return MATCH_DENY + end + end + end + + return MATCH_NONE +end + +function _M.check_schema(conf) + local ok, err = core.schema.check(schema, conf) + + if not ok then + return false, err + end + + return true +end + +function _M.access(conf, ctx) + local user_agent = core.request.header(ctx, "User-Agent") + + if not user_agent then + return + end + local 
match = MATCH_NONE + if type(user_agent) == "table" then + for _, v in ipairs(user_agent) do + if type(v) == "string" then + match = lrucache_useragent(v, conf, match_user_agent, v, conf) + if match > MATCH_ALLOW then + break + end + end + end + else + match = lrucache_useragent(user_agent, conf, match_user_agent, user_agent, conf) + end + + if match > MATCH_ALLOW then + return 403, { message = conf.message } + end +end + +return _M diff --git a/conf/config-default.yaml b/conf/config-default.yaml index 19969dc78e4a..a103928b965f 100644 --- a/conf/config-default.yaml +++ b/conf/config-default.yaml @@ -252,7 +252,7 @@ plugins: # plugin list (sorted by priority) - batch-requests # priority: 4010 - cors # priority: 4000 - ip-restriction # priority: 3000 - - bot-restriction # priority: 2999 + - ua-restriction # priority: 2999 - referer-restriction # priority: 2990 - uri-blocker # priority: 2900 - request-validation # priority: 2800 diff --git a/docs/en/latest/config.json b/docs/en/latest/config.json index d1b123a48014..72565a642304 100644 --- a/docs/en/latest/config.json +++ b/docs/en/latest/config.json @@ -73,7 +73,7 @@ "plugins/cors", "plugins/uri-blocker", "plugins/ip-restriction", - "plugins/bot-restriction", + "plugins/ua-restriction", "plugins/referer-restriction", "plugins/consumer-restriction" ] diff --git a/docs/en/latest/plugins/bot-restriction.md b/docs/en/latest/plugins/ua-restriction.md similarity index 77% rename from docs/en/latest/plugins/bot-restriction.md rename to docs/en/latest/plugins/ua-restriction.md index 7b6fb3bae6ef..3bf335e70e3f 100644 --- a/docs/en/latest/plugins/bot-restriction.md +++ b/docs/en/latest/plugins/ua-restriction.md @@ -1,5 +1,5 @@ --- -title: bot-restriction +title: ua-restriction ---