Skip to content

Commit a948436

Browse files
committed
generate-domains-blocklist.py is now super fast!
1 parent 4953ade commit a948436

2 files changed

Lines changed: 172 additions & 99 deletions

File tree

ChangeLog

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,18 @@ modify filtering rules and other configurations without restarting the proxy.
66
Simply edit a configuration file (like blocked-names.txt) and changes are applied
77
instantaneously.
88
- HTTP/3 probing is now supported via the `http3_probe` option, which will
9-
try HTTP/3 first for DoH servers, even if they don't advertise support via Alt-Svc.
10-
- Authentication for the monitoring UI can be disabled by setting the username
11-
to an empty string in the configuration.
9+
try HTTP/3 first for DoH servers, even if they don't advertise support via
10+
Alt-Svc.
11+
- Authentication for the monitoring UI can be disabled by setting the
12+
username to an empty string in the configuration.
1213
- Several race conditions have been fixed.
1314
- Dependencies have been updated.
1415
- The number of DHCP DNS detector instances has been reduced to improve performance.
1516
- Documentation has been added on using Tor isolation with dnscrypt-proxy to enhance privacy.
16-
- The default example configuration file has been improved for clarity and usability.
17+
- The default example configuration file has been improved for clarity and
18+
usability.
19+
- generate-domains-blocklist: added parallel downloading of block lists for
20+
significantly improved performance.
1721

1822
# Version 2.1.8
1923
- Dependencies have been updated, notably the QUIC implementation,

utils/generate-domains-blocklist/generate-domains-blocklist.py

Lines changed: 164 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
import re
99
import sys
1010
import fnmatch
11+
import concurrent.futures
12+
import time
1113

1214
try:
1315
import urllib2 as urllib
@@ -20,8 +22,10 @@
2022
URLLIB_NEW = True
2123

2224

23-
log_info = sys.stderr
24-
log_err = sys.stderr
25+
def setup_logging(output_file=None):
26+
log_info = sys.stdout if output_file else sys.stderr
27+
log_err = sys.stderr
28+
return log_info, log_err
2529

2630

2731
def parse_trusted_list(content):
@@ -103,8 +107,10 @@ def print_restricted_name(output_fd, name, time_restrictions):
103107
)
104108

105109

106-
def load_from_url(url):
110+
def load_from_url(url, timeout):
111+
log_info, log_err = setup_logging()
107112
log_info.write("Loading data from [{}]\n".format(url))
113+
108114
req = urllib.Request(url=url, headers={"User-Agent": "dnscrypt-proxy"})
109115
trusted = False
110116

@@ -117,7 +123,7 @@ def load_from_url(url):
117123

118124
response = None
119125
try:
120-
response = urllib.urlopen(req, timeout=int(args.timeout))
126+
response = urllib.urlopen(req, timeout=int(timeout))
121127
except urllib.URLError as err:
122128
raise Exception("[{}] could not be loaded: {}\n".format(url, err))
123129
if trusted is False and response.getcode() != 200:
@@ -171,45 +177,79 @@ def has_suffix(names, name):
171177
parts = parts[1:]
172178
if str.join(".", parts) in names:
173179
return True
174-
175180
return False
176181

177182

178-
def allowlist_from_url(url):
183+
def allowlist_from_url(url, timeout):
179184
if not url:
180185
return set()
181-
content, trusted = load_from_url(url)
186+
content, trusted = load_from_url(url, timeout)
182187

183188
names, _time_restrictions, _globs = parse_list(content, trusted)
184189
return names
185190

186191

187-
def blocklists_from_config_file(
188-
file, allowlist, time_restricted_url, ignore_retrieval_failure, output_file
189-
):
192+
def load_url_with_retry(url, timeout, retries=3, retry_delay=2):
193+
for attempt in range(retries):
194+
try:
195+
return load_from_url(url, timeout)
196+
except Exception as e:
197+
if attempt < retries - 1:
198+
time.sleep(retry_delay)
199+
else:
200+
raise e
201+
202+
203+
def load_blocklists_parallel(urls, timeout, ignore_retrieval_failure):
204+
log_info, log_err = setup_logging()
190205
blocklists = {}
191-
allowed_names = set()
192206
all_names = set()
193-
unique_names = set()
194207
all_globs = set()
195208

196-
# Load conf & blocklists
197-
with open(file) as fd:
198-
for line in fd:
199-
line = str.strip(line)
200-
if str.startswith(line, "#") or line == "":
201-
continue
202-
url = line
209+
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
210+
future_to_url = {
211+
executor.submit(load_url_with_retry, url, timeout): url
212+
for url in urls
213+
if url.strip() and not url.strip().startswith("#")
214+
}
215+
216+
for future in concurrent.futures.as_completed(future_to_url):
217+
url = future_to_url[future]
203218
try:
204-
content, trusted = load_from_url(url)
219+
content, trusted = future.result()
205220
names, _time_restrictions, globs = parse_list(content, trusted)
206221
blocklists[url] = names
207222
all_names |= names
208223
all_globs |= globs
209224
except Exception as e:
210225
log_err.write(str(e))
211226
if not ignore_retrieval_failure:
212-
exit(1)
227+
sys.exit(1)
228+
229+
return blocklists, all_names, all_globs
230+
231+
232+
def blocklists_from_config_file(
233+
file, allowlist, time_restricted_url, ignore_retrieval_failure, output_file, timeout
234+
):
235+
log_info, log_err = setup_logging(output_file)
236+
237+
# Get URLs from config file
238+
urls = []
239+
with open(file) as fd:
240+
for line in fd:
241+
line = str.strip(line)
242+
if str.startswith(line, "#") or line == "":
243+
continue
244+
urls.append(line)
245+
246+
# Load blocklists in parallel
247+
blocklists, all_names, all_globs = load_blocklists_parallel(
248+
urls, timeout, ignore_retrieval_failure
249+
)
250+
251+
# Load allowed names
252+
allowed_names = set()
213253

214254
# Time-based blocklist
215255
if time_restricted_url and not re.match(r"^[a-z0-9]+:", time_restricted_url):
@@ -220,36 +260,47 @@ def blocklists_from_config_file(
220260
output_fd = open(output_file, "w")
221261

222262
if time_restricted_url:
223-
time_restricted_content, _trusted = load_from_url(time_restricted_url)
224-
time_restricted_names, time_restrictions, _globs = parse_trusted_list(
225-
time_restricted_content
226-
)
227-
228-
if time_restricted_names:
229-
print(
230-
"########## Time-based blocklist ##########\n", file=output_fd, end="\n"
263+
try:
264+
time_restricted_content, _trusted = load_from_url(
265+
time_restricted_url, timeout
266+
)
267+
time_restricted_names, time_restrictions, _globs = parse_trusted_list(
268+
time_restricted_content
231269
)
232-
for name in time_restricted_names:
233-
print_restricted_name(output_fd, name, time_restrictions)
234270

235-
# Time restricted names should be allowed, or they could be always blocked
236-
allowed_names |= time_restricted_names
271+
if time_restricted_names:
272+
print(
273+
"########## Time-based blocklist ##########\n",
274+
file=output_fd,
275+
end="\n",
276+
)
277+
for name in time_restricted_names:
278+
print_restricted_name(output_fd, name, time_restrictions)
279+
280+
# Time restricted names should be allowed, or they could be always blocked
281+
allowed_names |= time_restricted_names
282+
except Exception as e:
283+
log_err.write(f"Error loading time-restricted list: {str(e)}\n")
237284

238285
# Allowed list
239286
if allowlist and not re.match(r"^[a-z0-9]+:", allowlist):
240287
allowlist = "file:" + allowlist
241288

242-
allowed_names |= allowlist_from_url(allowlist)
289+
try:
290+
allowed_names |= allowlist_from_url(allowlist, timeout)
291+
except Exception as e:
292+
log_err.write(f"Error loading allowlist: {str(e)}\n")
243293

244294
# Process blocklists
295+
unique_names = set()
245296
for url, names in blocklists.items():
246297
print(
247298
"\n\n########## Blocklist from {} ##########\n".format(url),
248299
file=output_fd,
249300
end="\n",
250301
)
251302
ignored, glob_ignored, allowed = 0, 0, 0
252-
list_names = list()
303+
list_names = []
253304
for name in names:
254305
if covered_by_glob(all_globs, name):
255306
glob_ignored = glob_ignored + 1
@@ -284,64 +335,82 @@ def blocklists_from_config_file(
284335
output_fd.close()
285336

286337

287-
argp = argparse.ArgumentParser(
288-
description="Create a unified blocklist from a set of local and remote files"
289-
)
290-
argp.add_argument(
291-
"-c",
292-
"--config",
293-
default="domains-blocklist.conf",
294-
help="file containing blocklist sources",
295-
)
296-
argp.add_argument(
297-
"-w",
298-
"--whitelist",
299-
help=argparse.SUPPRESS,
300-
)
301-
argp.add_argument(
302-
"-a",
303-
"--allowlist",
304-
default="domains-allowlist.txt",
305-
help="file containing a set of names to exclude from the blocklist",
306-
)
307-
argp.add_argument(
308-
"-r",
309-
"--time-restricted",
310-
default="domains-time-restricted.txt",
311-
help="file containing a set of names to be time restricted",
312-
)
313-
argp.add_argument(
314-
"-i",
315-
"--ignore-retrieval-failure",
316-
action="store_true",
317-
help="generate list even if some urls couldn't be retrieved",
318-
)
319-
argp.add_argument(
320-
"-o",
321-
"--output-file",
322-
default=None,
323-
help="save generated blocklist to a text file with the provided file name",
324-
)
325-
argp.add_argument("-t", "--timeout", default=30, help="URL open timeout")
326-
327-
args = argp.parse_args()
328-
329-
whitelist = args.whitelist
330-
if whitelist:
331-
print(
332-
"The option to provide a set of names to exclude from the blocklist has been changed from -w to -a\n"
338+
def main():
339+
argp = argparse.ArgumentParser(
340+
description="Create a unified blocklist from a set of local and remote files"
341+
)
342+
argp.add_argument(
343+
"-c",
344+
"--config",
345+
default="domains-blocklist.conf",
346+
help="file containing blocklist sources",
347+
)
348+
argp.add_argument(
349+
"-w",
350+
"--whitelist",
351+
help=argparse.SUPPRESS,
333352
)
334-
argp.print_help()
335-
exit(1)
336-
337-
conf = args.config
338-
allowlist = args.allowlist
339-
time_restricted = args.time_restricted
340-
ignore_retrieval_failure = args.ignore_retrieval_failure
341-
output_file = args.output_file
342-
if output_file:
343-
log_info = sys.stdout
344-
345-
blocklists_from_config_file(
346-
conf, allowlist, time_restricted, ignore_retrieval_failure, output_file
347-
)
353+
argp.add_argument(
354+
"-a",
355+
"--allowlist",
356+
default="domains-allowlist.txt",
357+
help="file containing a set of names to exclude from the blocklist",
358+
)
359+
argp.add_argument(
360+
"-r",
361+
"--time-restricted",
362+
default="domains-time-restricted.txt",
363+
help="file containing a set of names to be time restricted",
364+
)
365+
argp.add_argument(
366+
"-i",
367+
"--ignore-retrieval-failure",
368+
action="store_true",
369+
help="generate list even if some urls couldn't be retrieved",
370+
)
371+
argp.add_argument(
372+
"-o",
373+
"--output-file",
374+
default=None,
375+
help="save generated blocklist to a text file with the provided file name",
376+
)
377+
argp.add_argument("-t", "--timeout", default=30, help="URL open timeout in seconds")
378+
argp.add_argument(
379+
"-p",
380+
"--progress",
381+
action="store_true",
382+
help="show download progress information",
383+
)
384+
385+
args = argp.parse_args()
386+
387+
whitelist = args.whitelist
388+
if whitelist:
389+
print(
390+
"The option to provide a set of names to exclude from the blocklist has been changed from -w to -a\n"
391+
)
392+
argp.print_help()
393+
exit(1)
394+
395+
start_time = time.time()
396+
397+
log_info, _ = setup_logging(args.output_file)
398+
if args.progress:
399+
log_info.write("Starting blocklist generation...\n")
400+
401+
blocklists_from_config_file(
402+
args.config,
403+
args.allowlist,
404+
args.time_restricted,
405+
args.ignore_retrieval_failure,
406+
args.output_file,
407+
args.timeout,
408+
)
409+
410+
if args.progress:
411+
duration = time.time() - start_time
412+
log_info.write(f"Blocklist generation completed in {duration:.2f} seconds\n")
413+
414+
415+
if __name__ == "__main__":
416+
main()

0 commit comments

Comments
 (0)