 import requests
 import time
 from datetime import datetime
+from urllib.parse import urlparse
+from dotenv import load_dotenv
 
 # Configuration
 OUTPUT_DIR = "../Database-Files/Filter-Database/"
-OUTPUT_FILE = "Global-URLs.json"
+OUTPUT_FILE = "Global-Domains.json"
 OUTPUT_PATH = os.path.join(OUTPUT_DIR, OUTPUT_FILE)
+load_dotenv()
+
+EXCLUDED_DOMAINS = [d.strip() for d in os.getenv("EXCLUDED_DOMAINS", "").split(",") if d.strip()]
 
 # Sources
 SOURCES = [
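Note: the new `EXCLUDED_DOMAINS` setting is read from a `.env` file via `python-dotenv`. The file itself isn't part of this diff, so the values below are only a hypothetical example of the expected format:

```
# .env -- comma-separated list (values here are made up)
EXCLUDED_DOMAINS=discord.com,discord.gg,t.me
```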
@@ -67,6 +72,9 @@ def fetch_data(source):
                 url = account_data[field]
                 # Filter out excluded values
                 if url and url not in source["exclude_values"]:
+                    # Strip 'http://' or 'https://' from URLs for DART Project
+                    if source["name"] == "DART Project":
+                        url = url.replace("https://", "").replace("http://", "")
                     extracted_urls.append(url)
     print(f"[+] Extracted {len(extracted_urls)} URLs from {source['name']}")
     return extracted_urls
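For clarity, a small sketch of what the new DART Project branch does to an entry (the URL is made up):

```python
url = "https://scam.example.com/report/123"  # hypothetical DART Project value
url = url.replace("https://", "").replace("http://", "")
print(url)  # scam.example.com/report/123 -- scheme stripped, path kept
```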
@@ -102,6 +110,42 @@ def save_urls(urls):
     print(f"[+] Saved {len(urls)} URLs to {OUTPUT_PATH}")
 
 
+def is_url_excluded(url):
+    """Check if the URL contains any excluded domain."""
+    domain = urlparse(url).netloc
+    for excluded_domain in EXCLUDED_DOMAINS:
+        if excluded_domain in domain:
+            return True
+    return False
+
+
+def get_base_domain(url):
+    """Extract the base domain from a URL (without www or subdomains)."""
+    try:
+        # Handle cases where the URL might not have a scheme
+        if not url.startswith(("http://", "https://")):
+            url = "http://" + url
+
+        parsed = urlparse(url)
+        domain_parts = parsed.netloc.split(".")
+
+        # Handle cases like 'example.com' or 'www.example.com'
+        if len(domain_parts) > 2:
+            # For subdomains, take the last two parts (e.g., 'example.com' from 'sub.example.com')
+            base_domain = ".".join(domain_parts[-2:])
+        else:
+            base_domain = parsed.netloc
+
+        # Remove 'www.' if present
+        if base_domain.startswith("www."):
+            base_domain = base_domain[4:]
+
+        return base_domain.lower()
+    except Exception:
+        # Fallback for malformed URLs
+        return url.lower()
+
+
 def main():
     """Main function to fetch and process URLs."""
     start_time = time.time()
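One caveat on `get_base_domain`: the last-two-labels heuristic collapses hosts under country-code second-level domains, e.g. `shop.example.co.uk` becomes `co.uk`. A minimal sanity check of the two helpers, assuming they can be imported (the module name `fetch_urls` is hypothetical):

```python
from fetch_urls import get_base_domain, is_url_excluded

assert get_base_domain("https://sub.example.com/path") == "example.com"
assert get_base_domain("www.example.com") == "example.com"       # scheme is added internally
assert get_base_domain("https://shop.example.co.uk") == "co.uk"  # the two-label caveat

# With EXCLUDED_DOMAINS=discord.com set in .env:
# is_url_excluded("https://discord.com/invite/abc")  -> True
# is_url_excluded("discord.com/invite/abc")          -> False (no scheme, so netloc is empty)
```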
@@ -117,13 +161,17 @@ def main():
     existing_count = len(existing_urls)
     print(f"[+] Found {existing_count} existing URLs")
 
-    # Create a set for faster lookups and to ensure uniqueness
-    url_set = set(existing_urls)
+    # Create a set of base domains for existing URLs
+    existing_base_domains = {get_base_domain(url) for url in existing_urls}
 
-    # Fetch and process each source
+    # Initialize counters
     total_new_urls = 0
+    total_skipped_urls = 0
     source_stats = {}
 
+    # We'll store both the full URL and its base domain to maintain uniqueness
+    url_dict = {url: get_base_domain(url) for url in existing_urls}
+
     for source in SOURCES:
         source_data = fetch_data(source)
         if not source_data:
@@ -134,16 +182,27 @@ def main():
         # Count new URLs from this source
         new_from_source = 0
         for url in source_data:
-            if url not in url_set:
-                url_set.add(url)
+            # Skip excluded URLs
+            if is_url_excluded(url):
+                total_skipped_urls += 1
+                continue
+
+            base_domain = get_base_domain(url)
+
+            # Check if this base domain already exists
+            if base_domain not in existing_base_domains:
+                # Add the simplest form of the URL (just the domain)
+                simple_url = base_domain
+                url_dict[simple_url] = base_domain
+                existing_base_domains.add(base_domain)
                 new_from_source += 1
 
         total_new_urls += new_from_source
         source_stats[source["name"]] = new_from_source
         print(f"[+] Added {new_from_source} new URLs from {source['name']}")
 
-    # Convert set back to sorted list for saving
-    all_urls = sorted(list(url_set))
+    # Convert to a sorted list of just the domains (no paths)
+    all_urls = sorted(url_dict)
 
     # Save results
     save_urls(all_urls)
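The net effect of the loop above: the first URL seen for each base domain wins, and only the bare domain is kept. A self-contained sketch of that pattern with made-up input:

```python
from urllib.parse import urlparse

def base_domain(url):
    # Trimmed-down version of get_base_domain above (no error handling)
    if not url.startswith(("http://", "https://")):
        url = "http://" + url
    return ".".join(urlparse(url).netloc.split(".")[-2:]).lower()

seen, kept = set(), []
for url in ["https://sub.example.com/a", "http://example.com/b", "https://other.org/x"]:
    dom = base_domain(url)
    if dom not in seen:   # first occurrence of a base domain wins
        seen.add(dom)
        kept.append(dom)  # store the bare domain, not the full URL
print(sorted(kept))  # ['example.com', 'other.org']
```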
@@ -155,6 +214,7 @@ def main():
     print(f" - Starting URLs: {existing_count}")
     print(f" - New URLs added: {total_new_urls}")
     print(f" - Total unique URLs: {len(all_urls)}")
+    print(f" - Total URLs skipped/excluded: {total_skipped_urls}")
 
     # Print breakdown by source
     print(f"[+] New URLs by source:")
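With the new counter wired in, the summary block of a run would print something like this (the numbers are hypothetical):

```
 - Starting URLs: 1482
 - New URLs added: 37
 - Total unique URLs: 1519
 - Total URLs skipped/excluded: 12
[+] New URLs by source:
```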