Skip to content
This repository was archived by the owner on Apr 26, 2026. It is now read-only.

Commit d5cffd9

Browse files
committed
renamed beta tools
1 parent 5cfa2bb commit d5cffd9

2 files changed

Lines changed: 178 additions & 38 deletions

File tree

Lines changed: 178 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def is_valid_url(url):
5151
# Basic URL validation - check for domain structure
5252
# This simple check ensures the URL has at least something.domain format
5353
has_domain_structure = (
54-
re.search(r"[a-zA-Z0-9][\w.-]*\.[a-zA-Z]{2,}", url) is not None
54+
re.search(r"[a-zA-Z0-9][\w.-]*\.[a-zA-Z]{2,}", url) is not None
5555
)
5656

5757
return has_domain_structure
@@ -62,6 +62,23 @@ def is_discord_url(url):
6262
return is_valid_url(url) and ("discord.gg" in url or "discord.com" in url)
6363

6464

65+
def extract_discord_invite_id(url):
    """Extract the invite ID from a Discord URL regardless of format.

    Handles discord.gg/INVITEID, discord.gg/invite/INVITEID and
    discord.com/invite/INVITEID formats.

    Args:
        url: The URL string to parse.

    Returns:
        The invite ID lowercased for consistent comparison, or None when
        the URL is not a Discord URL or contains no recognizable invite ID.
    """
    if not is_discord_url(url):
        return None

    # Match discord.gg/ID, discord.gg/invite/ID and discord.com/invite/ID.
    # Invite codes may contain letters, digits, underscores and hyphens
    # (vanity invites), so [\w-]+ is used instead of [a-zA-Z0-9]+, which
    # would silently truncate such codes. The optional "invite/" segment
    # after discord.gg prevents the literal word "invite" from being
    # captured as the ID for discord.gg/invite/CODE URLs.
    pattern = r"(?:discord\.gg/(?:invite/)?|discord\.com/invite/)([\w-]+)"
    match = re.search(pattern, url, re.IGNORECASE)

    if match:
        # NOTE(review): Discord invite codes are case-sensitive upstream;
        # lowercasing here is deliberate so duplicates differing only by
        # case collapse during deduplication — confirm this is acceptable.
        return match.group(1).lower()
    return None
80+
81+
6582
def process_database():
6683
print("\n" + "=" * 80)
6784
print("DISCORD DATABASE FILTER TOOL - STARTING PROCESS")
@@ -91,6 +108,7 @@ def process_database():
91108
new_urls = 0
92109
new_discord_servers = 0
93110
invalid_urls_skipped = 0
111+
duplicate_invites_skipped = 0
94112

95113
# Load existing data if files exist
96114
discord_ids_data = {}
@@ -135,24 +153,111 @@ def process_database():
135153
f"No existing Discord servers file found, will create new file: {discord_servers_file}"
136154
)
137155

138-
# Clean existing data - remove any invalid URLs that might have been previously added
139-
urls_before_cleaning = len(urls_data)
140-
discord_servers_before_cleaning = len(discord_servers_data)
156+
# Convert old format to new format if needed
157+
# The old format uses URLs as keys, the new format uses DISCORD_SERVER_X as keys
158+
converted_discord_servers = {}
159+
160+
# Check if we need to convert the format (if any URL is used as a key)
161+
needs_conversion = False
162+
for key in discord_servers_data:
163+
if is_discord_url(key):
164+
needs_conversion = True
165+
break
166+
167+
# Create a mapping of invite IDs to server keys for deduplication
168+
invite_id_to_key_map = {}
169+
170+
if needs_conversion:
171+
print("\nConverting Discord servers data to new format...")
172+
server_index = 1
173+
174+
for key, data in discord_servers_data.items():
175+
if is_discord_url(key):
176+
# This is in the old format, extract the invite ID
177+
invite_id = extract_discord_invite_id(key)
178+
179+
if invite_id and invite_id in invite_id_to_key_map:
180+
# Skip this duplicate
181+
duplicate_invites_skipped += 1
182+
continue
183+
184+
# Create new entry in the new format
185+
new_key = f"DISCORD_SERVER_{server_index}"
186+
converted_discord_servers[new_key] = {
187+
"INVITE_URL": key,
188+
"FOUND_ON": data.get("FOUND_ON", 0),
189+
"SERVER_ID": "UNKNOWN",
190+
"REASON": "UNKNOWN"
191+
}
141192

142-
urls_data = {url: data for url, data in urls_data.items() if is_valid_url(url)}
143-
discord_servers_data = {
144-
url: data for url, data in discord_servers_data.items() if is_valid_url(url)
145-
}
193+
if invite_id:
194+
invite_id_to_key_map[invite_id] = new_key
195+
server_index += 1
196+
else:
197+
# This is already in the new format, keep it but check for duplicates
198+
if isinstance(data, dict) and "INVITE_URL" in data:
199+
invite_id = extract_discord_invite_id(data["INVITE_URL"])
200+
201+
if invite_id:
202+
if invite_id in invite_id_to_key_map:
203+
# This is a duplicate, skip it
204+
duplicate_invites_skipped += 1
205+
continue
206+
invite_id_to_key_map[invite_id] = key
146207

208+
converted_discord_servers[key] = data
209+
210+
if duplicate_invites_skipped > 0:
211+
print(f"Skipped {duplicate_invites_skipped} duplicate Discord invites during conversion")
212+
213+
print(f"Converted {len(converted_discord_servers)} Discord servers to new format")
214+
discord_servers_data = converted_discord_servers
215+
else:
216+
# Build the invite ID to key map for deduplication
217+
for key, data in discord_servers_data.items():
218+
if isinstance(data, dict) and "INVITE_URL" in data:
219+
invite_id = extract_discord_invite_id(data["INVITE_URL"])
220+
if invite_id:
221+
invite_id_to_key_map[invite_id] = key
222+
223+
# Deduplicate existing entries
224+
if not needs_conversion: # Only if we didn't already deduplicate during conversion
225+
print("\nChecking for duplicate Discord invites in existing data...")
226+
keys_to_remove = set()
227+
228+
# First pass: identify duplicates
229+
seen_invite_ids = set()
230+
for key, data in discord_servers_data.items():
231+
if isinstance(data, dict) and "INVITE_URL" in data:
232+
invite_id = extract_discord_invite_id(data["INVITE_URL"])
233+
if invite_id:
234+
if invite_id in seen_invite_ids:
235+
keys_to_remove.add(key)
236+
duplicate_invites_skipped += 1
237+
else:
238+
seen_invite_ids.add(invite_id)
239+
240+
# Second pass: remove duplicates
241+
for key in keys_to_remove:
242+
del discord_servers_data[key]
243+
244+
if duplicate_invites_skipped > 0:
245+
print(f"Removed {duplicate_invites_skipped} duplicate Discord invites from existing data")
246+
# Rebuild the mapping after deduplication
247+
invite_id_to_key_map = {}
248+
for key, data in discord_servers_data.items():
249+
if isinstance(data, dict) and "INVITE_URL" in data:
250+
invite_id = extract_discord_invite_id(data["INVITE_URL"])
251+
if invite_id:
252+
invite_id_to_key_map[invite_id] = key
253+
254+
# Clean existing data - remove any invalid URLs
255+
urls_before_cleaning = len(urls_data)
256+
urls_data = {url: data for url, data in urls_data.items() if is_valid_url(url)}
147257
cleaned_urls = urls_before_cleaning - len(urls_data)
148-
cleaned_discord_servers = discord_servers_before_cleaning - len(
149-
discord_servers_data
150-
)
151258

152-
if cleaned_urls > 0 or cleaned_discord_servers > 0:
153-
print(
154-
f"\nCleaned up {cleaned_urls} invalid URLs and {cleaned_discord_servers} invalid Discord server URLs from existing data"
155-
)
259+
if cleaned_urls > 0:
260+
print(f"\nCleaned up {cleaned_urls} invalid URLs from existing data")
156261

157262
# Read and process the main database
158263
try:
@@ -166,6 +271,17 @@ def process_database():
166271

167272
print("\nProcessing database...")
168273
processed_count = 0
274+
duplicate_count = 0
275+
276+
# Find the next available server index
277+
next_server_index = 1
278+
for key in discord_servers_data.keys():
279+
if key.startswith("DISCORD_SERVER_"):
280+
try:
281+
index = int(key.split("_")[2])
282+
next_server_index = max(next_server_index, index + 1)
283+
except (ValueError, IndexError):
284+
pass
169285

170286
# Process each account
171287
for account_key, account_info in accounts_data.items():
@@ -185,27 +301,39 @@ def process_database():
185301
"TYPE": account_type,
186302
}
187303
new_discord_ids += 1
188-
if (
189-
new_discord_ids % 10 == 0
190-
): # Log less frequently to avoid excessive output
191-
print(
192-
f" Added new Discord ID: {discord_id} (Type: {account_type})"
193-
)
304+
if new_discord_ids % 10 == 0:
305+
print(f" Added new Discord ID: {discord_id} (Type: {account_type})")
194306

195307
# Process URLs
196308
final_url = account_info.get("FINAL_URL")
309+
found_date = convert_date_to_epoch(account_info.get("FOUND_ON", ""))
197310

198311
if final_url:
199312
if not is_valid_url(final_url):
200313
invalid_urls_skipped += 1
201314
continue
202315

203-
found_date = convert_date_to_epoch(account_info.get("FOUND_ON", ""))
204-
205316
# Check if it's a Discord server URL
206317
if is_discord_url(final_url):
207-
if final_url not in discord_servers_data:
208-
discord_servers_data[final_url] = {"FOUND_ON": found_date}
318+
# Extract the invite ID to check for duplicates
319+
invite_id = extract_discord_invite_id(final_url)
320+
321+
if invite_id:
322+
if invite_id in invite_id_to_key_map:
323+
# This is a duplicate, skip it
324+
duplicate_count += 1
325+
continue
326+
327+
# Create a new entry in the new format
328+
new_key = f"DISCORD_SERVER_{next_server_index}"
329+
discord_servers_data[new_key] = {
330+
"INVITE_URL": final_url,
331+
"FOUND_ON": found_date,
332+
"SERVER_ID": "UNKNOWN",
333+
"REASON": "UNKNOWN"
334+
}
335+
invite_id_to_key_map[invite_id] = new_key
336+
next_server_index += 1
209337
new_discord_servers += 1
210338
if new_discord_servers % 10 == 0:
211339
print(f" Added new Discord server URL: {final_url}")
@@ -220,20 +348,35 @@ def process_database():
220348
# Also check SURFACE_URL for Discord links
221349
surface_url = account_info.get("SURFACE_URL")
222350
if surface_url and is_valid_url(surface_url) and is_discord_url(surface_url):
223-
if surface_url not in discord_servers_data:
224-
found_date = convert_date_to_epoch(account_info.get("FOUND_ON", ""))
225-
discord_servers_data[surface_url] = {"FOUND_ON": found_date}
351+
# Extract the invite ID to check for duplicates
352+
invite_id = extract_discord_invite_id(surface_url)
353+
354+
if invite_id:
355+
if invite_id in invite_id_to_key_map:
356+
# This is a duplicate, skip it
357+
duplicate_count += 1
358+
continue
359+
360+
# Create a new entry in the new format
361+
new_key = f"DISCORD_SERVER_{next_server_index}"
362+
discord_servers_data[new_key] = {
363+
"INVITE_URL": surface_url,
364+
"FOUND_ON": found_date,
365+
"SERVER_ID": "UNKNOWN",
366+
"REASON": "UNKNOWN"
367+
}
368+
invite_id_to_key_map[invite_id] = new_key
369+
next_server_index += 1
226370
new_discord_servers += 1
227371
if new_discord_servers % 10 == 0:
228-
print(
229-
f" Added new Discord server URL (from surface): {surface_url}"
230-
)
372+
print(f" Added new Discord server URL (from surface): {surface_url}")
231373

232374
print(f"\nProcessed all {processed_count} accounts")
233375
print(f"Found {new_discord_ids} new Discord IDs")
234376
print(f"Found {new_urls} new URLs")
235377
print(f"Found {new_discord_servers} new Discord server URLs")
236378
print(f"Skipped {invalid_urls_skipped} invalid URLs")
379+
print(f"Skipped {duplicate_count} duplicate Discord invites")
237380

238381
# Write the updated data to files
239382
print("\nWriting updated data to output files...")
@@ -248,20 +391,17 @@ def process_database():
248391

249392
with open(discord_servers_file, "w") as f:
250393
json.dump(discord_servers_data, f, indent=4)
251-
print(
252-
f"Written {len(discord_servers_data)} Discord servers to {discord_servers_file}"
253-
)
394+
print(f"Written {len(discord_servers_data)} Discord servers to {discord_servers_file}")
254395

255396
print("\n" + "=" * 80)
256397
print(f"PROCESS COMPLETE")
257398
print(f"Total Discord IDs: {len(discord_ids_data)} ({new_discord_ids} new)")
258399
print(f"Total URLs: {len(urls_data)} ({new_urls} new)")
259-
print(
260-
f"Total Discord servers: {len(discord_servers_data)} ({new_discord_servers} new)"
261-
)
400+
print(f"Total Discord servers: {len(discord_servers_data)} ({new_discord_servers} new)")
262401
print(f"Total invalid URLs skipped: {invalid_urls_skipped}")
402+
print(f"Total duplicate Discord invites skipped: {duplicate_count + duplicate_invites_skipped}")
263403
print("=" * 80 + "\n")
264404

265405

266406
if __name__ == "__main__":
267-
process_database()
407+
process_database()

0 commit comments

Comments
 (0)