
Commit 84a11e7

improved data fetcher tool
1 parent 4bca9f8 commit 84a11e7

1 file changed: Tool-22-URL-Data-Fetcher.py (68 additions & 8 deletions)
@@ -3,11 +3,16 @@
 import requests
 import time
 from datetime import datetime
+from urllib.parse import urlparse
+from dotenv import load_dotenv
 
 # Configuration
 OUTPUT_DIR = "../Database-Files/Filter-Database/"
-OUTPUT_FILE = "Global-URLs.json"
+OUTPUT_FILE = "Global-Domains.json"
 OUTPUT_PATH = os.path.join(OUTPUT_DIR, OUTPUT_FILE)
+load_dotenv()
+
+EXCLUDED_DOMAINS = os.getenv("EXCLUDED_DOMAINS", "").split(",")
 
 # Sources
 SOURCES = [
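
Note on the new configuration: EXCLUDED_DOMAINS is loaded from a .env file via python-dotenv and split on commas. The actual .env is not part of this commit, so the values below are a hypothetical sketch:

# .env (hypothetical example values)
EXCLUDED_DOMAINS=example-ads.net,example-tracker.com

# After load_dotenv(), the module-level line evaluates to:
#   os.getenv("EXCLUDED_DOMAINS", "").split(",")
#   -> ["example-ads.net", "example-tracker.com"]
# Caveat: if the variable is unset, the "" default splits to [""], not [].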
@@ -67,6 +72,9 @@ def fetch_data(source):
                 url = account_data[field]
                 # Filter out excluded values
                 if url and url not in source["exclude_values"]:
+                    # Strip 'http://' or 'https://' from URLs for DART Project
+                    if source["name"] == "DART Project":
+                        url = url.replace("https://", "").replace("http://", "")
                     extracted_urls.append(url)
     print(f"[+] Extracted {len(extracted_urls)} URLs from {source['name']}")
     return extracted_urls
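
The scheme strip is a plain string replacement, so host and path survive intact; a hypothetical DART Project value would transform like this:

url = "https://sub.example.com/path"  # hypothetical feed value
url = url.replace("https://", "").replace("http://", "")
# -> "sub.example.com/path"  (only the scheme is removed)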
@@ -102,6 +110,42 @@ def save_urls(urls):
     print(f"[+] Saved {len(urls)} URLs to {OUTPUT_PATH}")
 
 
+def is_url_excluded(url):
+    """Check if the URL contains any excluded domain."""
+    domain = urlparse(url).netloc
+    for excluded_domain in EXCLUDED_DOMAINS:
+        if excluded_domain in domain:
+            return True
+    return False
+
+
+def get_base_domain(url):
+    """Extract the base domain from a URL (without www or subdomains)."""
+    try:
+        # Handle cases where URL might not have scheme
+        if not url.startswith(("http://", "https://")):
+            url = "http://" + url
+
+        parsed = urlparse(url)
+        domain_parts = parsed.netloc.split(".")
+
+        # Handle cases like 'example.com' or 'www.example.com'
+        if len(domain_parts) > 2:
+            # For subdomains, we take the last two parts (e.g., 'example.com' from 'sub.example.com')
+            base_domain = ".".join(domain_parts[-2:])
+        else:
+            base_domain = parsed.netloc
+
+        # Remove www. if present
+        if base_domain.startswith("www."):
+            base_domain = base_domain[4:]
+
+        return base_domain.lower()
+    except:
+        # Fallback for malformed URLs
+        return url.lower()
+
+
 def main():
     """Main function to fetch and process URLs."""
     start_time = time.time()
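
A short sketch of how the two new helpers behave, using made-up URLs and the hypothetical exclusion list from above:

is_url_excluded("https://example-ads.net/banner")  # True: "example-ads.net" appears in the netloc
is_url_excluded("https://unrelated.example/")      # False: no excluded domain is a substring
# Caveat: an empty string in EXCLUDED_DOMAINS (e.g. from an unset env var)
# is a substring of every netloc, so it would exclude all URLs.

get_base_domain("https://www.example.com/page")  # -> "example.com"
get_base_domain("sub.example.com")               # -> "example.com" (scheme added, last two labels kept)
get_base_domain("https://example.co.uk")         # -> "co.uk": the two-label heuristic misreads
                                                 # multi-part TLDs; a public-suffix library such as
                                                 # tldextract would handle these correctly.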
@@ -117,13 +161,17 @@ def main():
     existing_count = len(existing_urls)
     print(f"[+] Found {existing_count} existing URLs")
 
-    # Create a set for faster lookups and to ensure uniqueness
-    url_set = set(existing_urls)
+    # Create a set of base domains for existing URLs
+    existing_base_domains = {get_base_domain(url) for url in existing_urls}
 
-    # Fetch and process each source
+    # Initialize counters
     total_new_urls = 0
+    total_skipped_urls = 0
     source_stats = {}
 
+    # We'll store both the full URL and its base domain to maintain uniqueness
+    url_dict = {url: get_base_domain(url) for url in existing_urls}
+
     for source in SOURCES:
         source_data = fetch_data(source)
         if not source_data:
@@ -134,16 +182,27 @@ def main():
         # Count new URLs from this source
         new_from_source = 0
         for url in source_data:
-            if url not in url_set:
-                url_set.add(url)
+            # Skip excluded URLs
+            if is_url_excluded(url):
+                total_skipped_urls += 1
+                continue
+
+            base_domain = get_base_domain(url)
+
+            # Check if this base domain already exists
+            if base_domain not in existing_base_domains:
+                # Add the simplest form of the URL (just domain)
+                simple_url = base_domain
+                url_dict[simple_url] = base_domain
+                existing_base_domains.add(base_domain)
                 new_from_source += 1
 
         total_new_urls += new_from_source
         source_stats[source["name"]] = new_from_source
         print(f"[+] Added {new_from_source} new URLs from {source['name']}")
 
-    # Convert set back to sorted list for saving
-    all_urls = sorted(list(url_set))
+    # Convert to sorted list of just the domains (no paths)
+    all_urls = sorted({k for k, v in url_dict.items()})
 
     # Save results
     save_urls(all_urls)
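
Two observations on this hunk: the set comprehension keeps only the keys, so sorted({k for k, v in url_dict.items()}) is equivalent to the shorter sorted(url_dict); and because pre-existing entries enter url_dict verbatim, the saved list contains only bare domains to the extent the existing file already did. A minimal illustration with made-up entries:

url_dict = {"https://old.example.com/page": "example.com"}  # pre-existing entry, kept as-is
url_dict["newsite.example"] = "newsite.example"             # new entry, stored as bare domain
sorted({k for k, v in url_dict.items()})
# -> ["https://old.example.com/page", "newsite.example"]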
@@ -155,6 +214,7 @@ def main():
     print(f" - Starting URLs: {existing_count}")
     print(f" - New URLs added: {total_new_urls}")
     print(f" - Total unique URLs: {len(all_urls)}")
+    print(f" - Total URLs skipped/excluded: {total_skipped_urls}")
 
     # Print breakdown by source
     print(f"[+] New URLs by source:")
