forked from Mirrors/privacore-open-source-search-engine
Check sequence is: blacklist (urlblacklist.txt) -> whitelist (urlwhitelist.txt) -> wanted_check.so -> default (crawl)
59 lines
1.4 KiB
Plaintext
59 lines
1.4 KiB
Plaintext
# this file accepts multiple criteria types
|
|
# blocks matching urls from being inserted into spiderdb
|
|
# one criterion per line
|
|
#
|
|
# types:
|
|
# - domain
|
|
# - host
|
|
# - path
|
|
# - regex (pcre)
|
|
# - tld
|
|
#
|
|
# domain example
|
|
# ==============
|
|
# block all subdomains of the example.com domain (including the bare domain)
|
|
# blocks: www.example.com, sub.example.com, example.com
|
|
domain example.com
|
|
|
|
# block every subdomain of example.com except the empty subdomain (bare domain) and www
|
|
# blocks: sub.example.com
|
|
# allows: example.com, www.example.com
|
|
domain example.com allow=,www
|
|
|
|
# host example
|
|
# ============
|
|
# block specific host
|
|
# blocks: www.example.com
|
|
# allows: sub.example.com
|
|
host www.example.com
|
|
|
|
# block specific host with path (prefix)
|
|
# blocks: www.example.com/path/abc
|
|
# blocks: www.example.com/path/def
|
|
# allows: www.example.com/
|
|
# allows: www.example.com/some/
|
|
host www.example.com /path/
|
|
|
|
# path example
|
|
# ============
|
|
# blocks specific path (prefix)
|
|
# blocks: www.example.com/wp-admin, www.example.com/wp-admin/login
|
|
# allows: www.example.com/en/wp-admin
|
|
path /wp-admin
|
|
|
|
# regex example
|
|
# =============
|
|
# blocks url by regex (all domains; presumably what the '*' selects - compare the specific-domain form)
|
|
regex * https?://example\.com/(a|b)\.html
|
|
|
|
# blocks url by regex (specific domain)
|
|
regex example.com https?://example\.com/(a|b)\.html
|
|
|
|
# tld example
|
|
# ===========
|
|
# block entire country-specific TLDs (comma-separated list)
|
|
# blocks: www.example.my
|
|
# blocks: www.example.com.my
|
|
# blocks: www.example.sg
|
|
tld my,sg
|