@@ -51,7 +51,7 @@ def is_valid_url(url):
5151 # Basic URL validation - check for domain structure
5252 # This simple check ensures the URL has at least something.domain format
5353 has_domain_structure = (
54- re .search (r"[a-zA-Z0-9][\w.-]*\.[a-zA-Z]{2,}" , url ) is not None
54+ re .search (r"[a-zA-Z0-9][\w.-]*\.[a-zA-Z]{2,}" , url ) is not None
5555 )
5656
5757 return has_domain_structure
@@ -62,6 +62,23 @@ def is_discord_url(url):
6262 return is_valid_url (url ) and ("discord.gg" in url or "discord.com" in url )
6363
6464
65+ def extract_discord_invite_id (url ):
66+ """Extract the invite ID from a Discord URL regardless of format.
67+
68+ Handles both discord.gg/INVITEID and discord.com/invite/INVITEID formats.
69+ """
70+ if not is_discord_url (url ):
71+ return None
72+
73+ # Match patterns like discord.gg/INVITEID or discord.com/invite/INVITEID
74+ pattern = r"(?:discord\.gg\/|discord\.com\/invite\/)([a-zA-Z0-9]+)"
75+ match = re .search (pattern , url , re .IGNORECASE )
76+
77+ if match :
78+ return match .group (1 ).lower () # Return the ID in lowercase for consistent comparison
79+ return None
80+
81+
6582def process_database ():
6683 print ("\n " + "=" * 80 )
6784 print ("DISCORD DATABASE FILTER TOOL - STARTING PROCESS" )
@@ -91,6 +108,7 @@ def process_database():
91108 new_urls = 0
92109 new_discord_servers = 0
93110 invalid_urls_skipped = 0
111+ duplicate_invites_skipped = 0
94112
95113 # Load existing data if files exist
96114 discord_ids_data = {}
@@ -135,24 +153,111 @@ def process_database():
135153 f"No existing Discord servers file found, will create new file: { discord_servers_file } "
136154 )
137155
138- # Clean existing data - remove any invalid URLs that might have been previously added
139- urls_before_cleaning = len (urls_data )
140- discord_servers_before_cleaning = len (discord_servers_data )
156+ # Convert old format to new format if needed
157+ # The old format uses URLs as keys, the new format uses DISCORD_SERVER_X as keys
158+ converted_discord_servers = {}
159+
160+ # Check if we need to convert the format (if any URL is used as a key)
161+ needs_conversion = False
162+ for key in discord_servers_data :
163+ if is_discord_url (key ):
164+ needs_conversion = True
165+ break
166+
167+ # Create a mapping of invite IDs to server keys for deduplication
168+ invite_id_to_key_map = {}
169+
170+ if needs_conversion :
171+ print ("\n Converting Discord servers data to new format..." )
172+ server_index = 1
173+
174+ for key , data in discord_servers_data .items ():
175+ if is_discord_url (key ):
176+ # This is in the old format, extract the invite ID
177+ invite_id = extract_discord_invite_id (key )
178+
179+ if invite_id and invite_id in invite_id_to_key_map :
180+ # Skip this duplicate
181+ duplicate_invites_skipped += 1
182+ continue
183+
184+ # Create new entry in the new format
185+ new_key = f"DISCORD_SERVER_{ server_index } "
186+ converted_discord_servers [new_key ] = {
187+ "INVITE_URL" : key ,
188+ "FOUND_ON" : data .get ("FOUND_ON" , 0 ),
189+ "SERVER_ID" : "UNKNOWN" ,
190+ "REASON" : "UNKNOWN"
191+ }
141192
142- urls_data = {url : data for url , data in urls_data .items () if is_valid_url (url )}
143- discord_servers_data = {
144- url : data for url , data in discord_servers_data .items () if is_valid_url (url )
145- }
193+ if invite_id :
194+ invite_id_to_key_map [invite_id ] = new_key
195+ server_index += 1
196+ else :
197+ # This is already in the new format, keep it but check for duplicates
198+ if isinstance (data , dict ) and "INVITE_URL" in data :
199+ invite_id = extract_discord_invite_id (data ["INVITE_URL" ])
200+
201+ if invite_id :
202+ if invite_id in invite_id_to_key_map :
203+ # This is a duplicate, skip it
204+ duplicate_invites_skipped += 1
205+ continue
206+ invite_id_to_key_map [invite_id ] = key
146207
208+ converted_discord_servers [key ] = data
209+
210+ if duplicate_invites_skipped > 0 :
211+ print (f"Skipped { duplicate_invites_skipped } duplicate Discord invites during conversion" )
212+
213+ print (f"Converted { len (converted_discord_servers )} Discord servers to new format" )
214+ discord_servers_data = converted_discord_servers
215+ else :
216+ # Build the invite ID to key map for deduplication
217+ for key , data in discord_servers_data .items ():
218+ if isinstance (data , dict ) and "INVITE_URL" in data :
219+ invite_id = extract_discord_invite_id (data ["INVITE_URL" ])
220+ if invite_id :
221+ invite_id_to_key_map [invite_id ] = key
222+
223+ # Deduplicate existing entries
224+ if not needs_conversion : # Only if we didn't already deduplicate during conversion
225+ print ("\n Checking for duplicate Discord invites in existing data..." )
226+ keys_to_remove = set ()
227+
228+ # First pass: identify duplicates
229+ seen_invite_ids = set ()
230+ for key , data in discord_servers_data .items ():
231+ if isinstance (data , dict ) and "INVITE_URL" in data :
232+ invite_id = extract_discord_invite_id (data ["INVITE_URL" ])
233+ if invite_id :
234+ if invite_id in seen_invite_ids :
235+ keys_to_remove .add (key )
236+ duplicate_invites_skipped += 1
237+ else :
238+ seen_invite_ids .add (invite_id )
239+
240+ # Second pass: remove duplicates
241+ for key in keys_to_remove :
242+ del discord_servers_data [key ]
243+
244+ if duplicate_invites_skipped > 0 :
245+ print (f"Removed { duplicate_invites_skipped } duplicate Discord invites from existing data" )
246+ # Rebuild the mapping after deduplication
247+ invite_id_to_key_map = {}
248+ for key , data in discord_servers_data .items ():
249+ if isinstance (data , dict ) and "INVITE_URL" in data :
250+ invite_id = extract_discord_invite_id (data ["INVITE_URL" ])
251+ if invite_id :
252+ invite_id_to_key_map [invite_id ] = key
253+
254+ # Clean existing data - remove any invalid URLs
255+ urls_before_cleaning = len (urls_data )
256+ urls_data = {url : data for url , data in urls_data .items () if is_valid_url (url )}
147257 cleaned_urls = urls_before_cleaning - len (urls_data )
148- cleaned_discord_servers = discord_servers_before_cleaning - len (
149- discord_servers_data
150- )
151258
152- if cleaned_urls > 0 or cleaned_discord_servers > 0 :
153- print (
154- f"\n Cleaned up { cleaned_urls } invalid URLs and { cleaned_discord_servers } invalid Discord server URLs from existing data"
155- )
259+ if cleaned_urls > 0 :
260+ print (f"\n Cleaned up { cleaned_urls } invalid URLs from existing data" )
156261
157262 # Read and process the main database
158263 try :
@@ -166,6 +271,17 @@ def process_database():
166271
167272 print ("\n Processing database..." )
168273 processed_count = 0
274+ duplicate_count = 0
275+
276+ # Find the next available server index
277+ next_server_index = 1
278+ for key in discord_servers_data .keys ():
279+ if key .startswith ("DISCORD_SERVER_" ):
280+ try :
281+ index = int (key .split ("_" )[2 ])
282+ next_server_index = max (next_server_index , index + 1 )
283+ except (ValueError , IndexError ):
284+ pass
169285
170286 # Process each account
171287 for account_key , account_info in accounts_data .items ():
@@ -185,27 +301,39 @@ def process_database():
185301 "TYPE" : account_type ,
186302 }
187303 new_discord_ids += 1
188- if (
189- new_discord_ids % 10 == 0
190- ): # Log less frequently to avoid excessive output
191- print (
192- f" Added new Discord ID: { discord_id } (Type: { account_type } )"
193- )
304+ if new_discord_ids % 10 == 0 :
305+ print (f" Added new Discord ID: { discord_id } (Type: { account_type } )" )
194306
195307 # Process URLs
196308 final_url = account_info .get ("FINAL_URL" )
309+ found_date = convert_date_to_epoch (account_info .get ("FOUND_ON" , "" ))
197310
198311 if final_url :
199312 if not is_valid_url (final_url ):
200313 invalid_urls_skipped += 1
201314 continue
202315
203- found_date = convert_date_to_epoch (account_info .get ("FOUND_ON" , "" ))
204-
205316 # Check if it's a Discord server URL
206317 if is_discord_url (final_url ):
207- if final_url not in discord_servers_data :
208- discord_servers_data [final_url ] = {"FOUND_ON" : found_date }
318+ # Extract the invite ID to check for duplicates
319+ invite_id = extract_discord_invite_id (final_url )
320+
321+ if invite_id :
322+ if invite_id in invite_id_to_key_map :
323+ # This is a duplicate, skip it
324+ duplicate_count += 1
325+ continue
326+
327+ # Create a new entry in the new format
328+ new_key = f"DISCORD_SERVER_{ next_server_index } "
329+ discord_servers_data [new_key ] = {
330+ "INVITE_URL" : final_url ,
331+ "FOUND_ON" : found_date ,
332+ "SERVER_ID" : "UNKNOWN" ,
333+ "REASON" : "UNKNOWN"
334+ }
335+ invite_id_to_key_map [invite_id ] = new_key
336+ next_server_index += 1
209337 new_discord_servers += 1
210338 if new_discord_servers % 10 == 0 :
211339 print (f" Added new Discord server URL: { final_url } " )
@@ -220,20 +348,35 @@ def process_database():
220348 # Also check SURFACE_URL for Discord links
221349 surface_url = account_info .get ("SURFACE_URL" )
222350 if surface_url and is_valid_url (surface_url ) and is_discord_url (surface_url ):
223- if surface_url not in discord_servers_data :
224- found_date = convert_date_to_epoch (account_info .get ("FOUND_ON" , "" ))
225- discord_servers_data [surface_url ] = {"FOUND_ON" : found_date }
351+ # Extract the invite ID to check for duplicates
352+ invite_id = extract_discord_invite_id (surface_url )
353+
354+ if invite_id :
355+ if invite_id in invite_id_to_key_map :
356+ # This is a duplicate, skip it
357+ duplicate_count += 1
358+ continue
359+
360+ # Create a new entry in the new format
361+ new_key = f"DISCORD_SERVER_{ next_server_index } "
362+ discord_servers_data [new_key ] = {
363+ "INVITE_URL" : surface_url ,
364+ "FOUND_ON" : found_date ,
365+ "SERVER_ID" : "UNKNOWN" ,
366+ "REASON" : "UNKNOWN"
367+ }
368+ invite_id_to_key_map [invite_id ] = new_key
369+ next_server_index += 1
226370 new_discord_servers += 1
227371 if new_discord_servers % 10 == 0 :
228- print (
229- f" Added new Discord server URL (from surface): { surface_url } "
230- )
372+ print (f" Added new Discord server URL (from surface): { surface_url } " )
231373
232374 print (f"\n Processed all { processed_count } accounts" )
233375 print (f"Found { new_discord_ids } new Discord IDs" )
234376 print (f"Found { new_urls } new URLs" )
235377 print (f"Found { new_discord_servers } new Discord server URLs" )
236378 print (f"Skipped { invalid_urls_skipped } invalid URLs" )
379+ print (f"Skipped { duplicate_count } duplicate Discord invites" )
237380
238381 # Write the updated data to files
239382 print ("\n Writing updated data to output files..." )
@@ -248,20 +391,17 @@ def process_database():
248391
249392 with open (discord_servers_file , "w" ) as f :
250393 json .dump (discord_servers_data , f , indent = 4 )
251- print (
252- f"Written { len (discord_servers_data )} Discord servers to { discord_servers_file } "
253- )
394+ print (f"Written { len (discord_servers_data )} Discord servers to { discord_servers_file } " )
254395
255396 print ("\n " + "=" * 80 )
256397 print (f"PROCESS COMPLETE" )
257398 print (f"Total Discord IDs: { len (discord_ids_data )} ({ new_discord_ids } new)" )
258399 print (f"Total URLs: { len (urls_data )} ({ new_urls } new)" )
259- print (
260- f"Total Discord servers: { len (discord_servers_data )} ({ new_discord_servers } new)"
261- )
400+ print (f"Total Discord servers: { len (discord_servers_data )} ({ new_discord_servers } new)" )
262401 print (f"Total invalid URLs skipped: { invalid_urls_skipped } " )
402+ print (f"Total duplicate Discord invites skipped: { duplicate_count + duplicate_invites_skipped } " )
263403 print ("=" * 80 + "\n " )
264404
265405
266406if __name__ == "__main__" :
267- process_database ()
407+ process_database ()