88import re
99import sys
1010import fnmatch
11+ import concurrent .futures
12+ import time
1113
1214try :
1315 import urllib2 as urllib
2022 URLLIB_NEW = True
2123
2224
23- log_info = sys .stderr
24- log_err = sys .stderr
def setup_logging(output_file=None):
    """Select the streams used for informational and error logging.

    When the blocklist is written to a file, stdout is free for progress
    messages; otherwise stdout is reserved for the generated list itself
    and informational output goes to stderr.  Errors always go to stderr.

    Returns:
        (info_stream, error_stream) tuple of file-like objects.
    """
    if output_file:
        return sys.stdout, sys.stderr
    return sys.stderr, sys.stderr
2529
2630
2731def parse_trusted_list (content ):
@@ -103,8 +107,10 @@ def print_restricted_name(output_fd, name, time_restrictions):
103107 )
104108
105109
106- def load_from_url (url ):
110+ def load_from_url (url , timeout ):
111+ log_info , log_err = setup_logging ()
107112 log_info .write ("Loading data from [{}]\n " .format (url ))
113+
108114 req = urllib .Request (url = url , headers = {"User-Agent" : "dnscrypt-proxy" })
109115 trusted = False
110116
@@ -117,7 +123,7 @@ def load_from_url(url):
117123
118124 response = None
119125 try :
120- response = urllib .urlopen (req , timeout = int (args . timeout ))
126+ response = urllib .urlopen (req , timeout = int (timeout ))
121127 except urllib .URLError as err :
122128 raise Exception ("[{}] could not be loaded: {}\n " .format (url , err ))
123129 if trusted is False and response .getcode () != 200 :
@@ -171,45 +177,79 @@ def has_suffix(names, name):
171177 parts = parts [1 :]
172178 if str .join ("." , parts ) in names :
173179 return True
174-
175180 return False
176181
177182
def allowlist_from_url(url, timeout):
    """Fetch and parse an allowlist source.

    Returns the set of names parsed from *url*, or an empty set when no
    URL was supplied.  Time restrictions and globs in the source are
    ignored — only plain names matter for allowlisting.
    """
    if url:
        content, trusted = load_from_url(url, timeout)
        parsed_names, _restrictions, _globs = parse_list(content, trusted)
        return parsed_names
    return set()
185190
186191
187- def blocklists_from_config_file (
188- file , allowlist , time_restricted_url , ignore_retrieval_failure , output_file
189- ):
def load_url_with_retry(url, timeout, retries=3, retry_delay=2):
    """Load *url* via load_from_url, retrying transient failures.

    Args:
        url: source location understood by load_from_url.
        timeout: per-request timeout in seconds, forwarded to load_from_url.
        retries: total number of attempts; values below 1 behave as 1.
        retry_delay: seconds to sleep between attempts.

    Returns:
        The (content, trusted) tuple produced by load_from_url.

    Raises:
        Exception: the last error from load_from_url once all attempts fail.
    """
    # Guarantee at least one attempt: with the original loop, retries <= 0
    # fell through and silently returned None.
    attempts = max(1, int(retries))
    for attempt in range(attempts):
        try:
            return load_from_url(url, timeout)
        except Exception:
            if attempt == attempts - 1:
                # Bare raise re-raises the active exception with its
                # original traceback intact.
                raise
            time.sleep(retry_delay)
201+
202+
def load_blocklists_parallel(urls, timeout, ignore_retrieval_failure):
    """Download and parse every blocklist source concurrently.

    Blank entries and comment lines (leading '#') in *urls* are skipped.
    Each remaining URL is fetched on a thread pool via load_url_with_retry
    and parsed with parse_list.

    Args:
        urls: iterable of candidate source lines from the config file.
        timeout: per-request timeout in seconds.
        ignore_retrieval_failure: when false, any fetch error terminates
            the program with exit status 1 after logging it.

    Returns:
        (per_url_names, combined_names, combined_globs): a dict mapping
        each URL to its parsed name set, plus the union of all names and
        the union of all glob patterns.
    """
    _log_info, log_err = setup_logging()

    per_url_names = {}
    combined_names = set()
    combined_globs = set()

    candidates = [u for u in urls if u.strip() and not u.strip().startswith("#")]

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
        pending = {
            pool.submit(load_url_with_retry, source, timeout): source
            for source in candidates
        }
        for done in concurrent.futures.as_completed(pending):
            source = pending[done]
            try:
                content, trusted = done.result()
                names, _time_restrictions, globs = parse_list(content, trusted)
                per_url_names[source] = names
                combined_names |= names
                combined_globs |= globs
            except Exception as exc:
                log_err.write(str(exc))
                if not ignore_retrieval_failure:
                    sys.exit(1)

    return per_url_names, combined_names, combined_globs
230+
231+
232+ def blocklists_from_config_file (
233+ file , allowlist , time_restricted_url , ignore_retrieval_failure , output_file , timeout
234+ ):
235+ log_info , log_err = setup_logging (output_file )
236+
237+ # Get URLs from config file
238+ urls = []
239+ with open (file ) as fd :
240+ for line in fd :
241+ line = str .strip (line )
242+ if str .startswith (line , "#" ) or line == "" :
243+ continue
244+ urls .append (line )
245+
246+ # Load blocklists in parallel
247+ blocklists , all_names , all_globs = load_blocklists_parallel (
248+ urls , timeout , ignore_retrieval_failure
249+ )
250+
251+ # Load allowed names
252+ allowed_names = set ()
213253
214254 # Time-based blocklist
215255 if time_restricted_url and not re .match (r"^[a-z0-9]+:" , time_restricted_url ):
@@ -220,36 +260,47 @@ def blocklists_from_config_file(
220260 output_fd = open (output_file , "w" )
221261
222262 if time_restricted_url :
223- time_restricted_content , _trusted = load_from_url (time_restricted_url )
224- time_restricted_names , time_restrictions , _globs = parse_trusted_list (
225- time_restricted_content
226- )
227-
228- if time_restricted_names :
229- print (
230- "########## Time-based blocklist ##########\n " , file = output_fd , end = "\n "
263+ try :
264+ time_restricted_content , _trusted = load_from_url (
265+ time_restricted_url , timeout
266+ )
267+ time_restricted_names , time_restrictions , _globs = parse_trusted_list (
268+ time_restricted_content
231269 )
232- for name in time_restricted_names :
233- print_restricted_name (output_fd , name , time_restrictions )
234270
235- # Time restricted names should be allowed, or they could be always blocked
236- allowed_names |= time_restricted_names
271+ if time_restricted_names :
272+ print (
273+ "########## Time-based blocklist ##########\n " ,
274+ file = output_fd ,
275+ end = "\n " ,
276+ )
277+ for name in time_restricted_names :
278+ print_restricted_name (output_fd , name , time_restrictions )
279+
280+ # Time restricted names should be allowed, or they could be always blocked
281+ allowed_names |= time_restricted_names
282+ except Exception as e :
283+ log_err .write (f"Error loading time-restricted list: { str (e )} \n " )
237284
238285 # Allowed list
239286 if allowlist and not re .match (r"^[a-z0-9]+:" , allowlist ):
240287 allowlist = "file:" + allowlist
241288
242- allowed_names |= allowlist_from_url (allowlist )
289+ try :
290+ allowed_names |= allowlist_from_url (allowlist , timeout )
291+ except Exception as e :
292+ log_err .write (f"Error loading allowlist: { str (e )} \n " )
243293
244294 # Process blocklists
295+ unique_names = set ()
245296 for url , names in blocklists .items ():
246297 print (
247298 "\n \n ########## Blocklist from {} ##########\n " .format (url ),
248299 file = output_fd ,
249300 end = "\n " ,
250301 )
251302 ignored , glob_ignored , allowed = 0 , 0 , 0
252- list_names = list ()
303+ list_names = []
253304 for name in names :
254305 if covered_by_glob (all_globs , name ):
255306 glob_ignored = glob_ignored + 1
@@ -284,64 +335,82 @@ def blocklists_from_config_file(
284335 output_fd .close ()
285336
286337
def main():
    """Parse command-line options and generate the unified blocklist."""
    argp = argparse.ArgumentParser(
        description="Create a unified blocklist from a set of local and remote files"
    )
    argp.add_argument(
        "-c",
        "--config",
        default="domains-blocklist.conf",
        help="file containing blocklist sources",
    )
    # Deprecated spelling of -a/--allowlist; hidden from --help output.
    argp.add_argument(
        "-w",
        "--whitelist",
        help=argparse.SUPPRESS,
    )
    argp.add_argument(
        "-a",
        "--allowlist",
        default="domains-allowlist.txt",
        help="file containing a set of names to exclude from the blocklist",
    )
    argp.add_argument(
        "-r",
        "--time-restricted",
        default="domains-time-restricted.txt",
        help="file containing a set of names to be time restricted",
    )
    argp.add_argument(
        "-i",
        "--ignore-retrieval-failure",
        action="store_true",
        help="generate list even if some urls couldn't be retrieved",
    )
    argp.add_argument(
        "-o",
        "--output-file",
        default=None,
        help="save generated blocklist to a text file with the provided file name",
    )
    # type=int so a command-line "-t 45" arrives as an int rather than a
    # string; the default 30 was already an int.
    argp.add_argument(
        "-t", "--timeout", type=int, default=30, help="URL open timeout in seconds"
    )
    argp.add_argument(
        "-p",
        "--progress",
        action="store_true",
        help="show download progress information",
    )

    args = argp.parse_args()

    if args.whitelist:
        print(
            "The option to provide a set of names to exclude from the blocklist has been changed from -w to -a\n"
        )
        argp.print_help()
        # sys.exit for consistency with the rest of the file.
        sys.exit(1)

    start_time = time.time()

    log_info, _ = setup_logging(args.output_file)
    if args.progress:
        log_info.write("Starting blocklist generation...\n")

    blocklists_from_config_file(
        args.config,
        args.allowlist,
        args.time_restricted,
        args.ignore_retrieval_failure,
        args.output_file,
        args.timeout,
    )

    if args.progress:
        duration = time.time() - start_time
        log_info.write(f"Blocklist generation completed in {duration:.2f} seconds\n")


if __name__ == "__main__":
    main()
0 commit comments