Skip to content

Commit 5d30345

Browse files
committed
Improve generate-domains-blocklist.py for bad network situations
* Multithreading makes the logs less clear; improve them. * Cancel unstarted tasks to exit early. * Catch more exceptions.
1 parent d44b0e3 commit 5d30345

1 file changed

Lines changed: 37 additions & 14 deletions

File tree

utils/generate-domains-blocklist/generate-domains-blocklist.py

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -108,9 +108,6 @@ def print_restricted_name(output_fd, name, time_restrictions):
108108

109109

110110
def load_from_url(url, timeout):
111-
log_info, log_err = setup_logging()
112-
log_info.write("Loading data from [{}]\n".format(url))
113-
114111
req = urllib.Request(url=url, headers={"User-Agent": "dnscrypt-proxy"})
115112
trusted = False
116113

@@ -124,11 +121,11 @@ def load_from_url(url, timeout):
124121
response = None
125122
try:
126123
response = urllib.urlopen(req, timeout=int(timeout))
127-
except urllib.URLError as err:
128-
raise Exception("[{}] could not be loaded: {}\n".format(url, err))
124+
content = response.read() # "The read operation timed out"
125+
except Exception as err:
126+
raise Exception("[{}] could not be loaded: {}".format(url, err))
129127
if trusted is False and response.getcode() != 200:
130-
raise Exception("[{}] returned HTTP code {}\n".format(url, response.getcode()))
131-
content = response.read()
128+
raise Exception("[{}] returned HTTP code {}".format(url, response.getcode()))
132129
if URLLIB_NEW:
133130
content = content.decode("utf-8", errors="replace")
134131

@@ -188,13 +185,22 @@ def allowlist_from_url(url, timeout):
188185
names, _time_restrictions, _globs = parse_list(content, trusted)
189186
return names
190187

188+
STOP_RETRY = False
191189

192-
def load_url_with_retry(url, timeout, retries=3, retry_delay=2):
193-
for attempt in range(retries):
190+
def load_url_with_retry(url, timeout, tries=3, retry_delay=2):
191+
log_info, log_err = setup_logging()
192+
for attempt in range(tries):
193+
try_msg = f"try: {attempt + 1}/{tries}"
194194
try:
195-
return load_from_url(url, timeout)
195+
log_info.write(f"[{try_msg}] Loading data from [{url}]\n")
196+
content, trusted = load_from_url(url, timeout)
197+
log_err.write(f"[{try_msg}] [{url}] OK\n")
198+
return content, trusted
196199
except Exception as e:
197-
if attempt < retries - 1:
200+
log_err.write(f"[{try_msg}] {e}\n")
201+
if STOP_RETRY:
202+
break
203+
if attempt < tries - 1:
198204
time.sleep(retry_delay)
199205
else:
200206
raise e
@@ -210,10 +216,27 @@ def load_blocklists_parallel(urls, timeout, ignore_retrieval_failure):
210216
future_to_url = {
211217
executor.submit(load_url_with_retry, url, timeout): url
212218
for url in urls
213-
if url.strip() and not url.strip().startswith("#")
214219
}
215220

216-
for future in concurrent.futures.as_completed(future_to_url):
221+
# Useful for bad network situations
222+
return_when = concurrent.futures.FIRST_EXCEPTION
223+
if ignore_retrieval_failure:
224+
return_when = concurrent.futures.ALL_COMPLETED
225+
finished, unfinished = concurrent.futures.wait(future_to_url, None, return_when)
226+
# Return early
227+
if len(unfinished) > 0:
228+
# Cancel unstarted tasks
229+
for f in unfinished:
230+
if not f.done():
231+
f.cancel()
232+
# Stop retries
233+
global STOP_RETRY
234+
STOP_RETRY = True
235+
# Threads won't be terminated forcibly
236+
if not ignore_retrieval_failure:
237+
sys.exit(1)
238+
239+
for future in finished:
217240
url = future_to_url[future]
218241
try:
219242
content, trusted = future.result()
@@ -222,7 +245,7 @@ def load_blocklists_parallel(urls, timeout, ignore_retrieval_failure):
222245
all_names |= names
223246
all_globs |= globs
224247
except Exception as e:
225-
log_err.write(str(e))
248+
log_err.write(f"{e}\n")
226249
if not ignore_retrieval_failure:
227250
sys.exit(1)
228251

0 commit comments

Comments
 (0)