Skip to content

Commit 0a4ed9d

Browse files
committed
removes dupes
1 parent d4c4dad commit 0a4ed9d

File tree

7 files changed

+405730
-406478
lines changed

7 files changed

+405730
-406478
lines changed

.data/aml/generate.py

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,32 @@ def load_ofac_names():
3838
# Load OFAC names at startup
3939
OFAC_INDIVIDUALS, OFAC_COMPANIES = load_ofac_names()
4040

41+
# Track used OFAC names to prevent duplicates.
# get_unique_ofac_name() records every name it returns in the matching set
# (individuals vs. companies) and skips names already present;
# reset_ofac_tracking() empties both sets.
42+
USED_OFAC_INDIVIDUALS = set()
43+
USED_OFAC_COMPANIES = set()
44+
45+
def reset_ofac_tracking():
    """Forget every OFAC name handed out so far.

    Empties both module-level tracking sets in place so that a fresh
    generation run can draw from the complete OFAC individual and company
    lists again.
    """
    # clear() mutates the existing set objects, so the names are never
    # rebound and no ``global`` declaration is required.
    for used_names in (USED_OFAC_INDIVIDUALS, USED_OFAC_COMPANIES):
        used_names.clear()
50+
51+
def get_unique_ofac_name(is_company=False):
    """Pick a random OFAC name that has not been returned before.

    Args:
        is_company: When truthy, draw from OFAC_COMPANIES; otherwise draw
            from OFAC_INDIVIDUALS.

    Returns:
        A name that was not yet recorded in the matching USED_OFAC_* set.
        The name is recorded there before it is returned. Returns None when
        no unused name of the requested kind remains (including when the
        source list is empty).
    """
    # Resolve the source list and its tracking set up front so both entity
    # kinds share a single code path.
    if is_company:
        pool, taken = OFAC_COMPANIES, USED_OFAC_COMPANIES
    else:
        pool, taken = OFAC_INDIVIDUALS, USED_OFAC_INDIVIDUALS

    candidates = [candidate for candidate in pool if candidate not in taken]
    if not candidates:
        return None  # No unique names available

    picked = random.choice(candidates)
    taken.add(picked)
    return picked
66+
4167
# Constants
4268
CURRENT_DATE = datetime.today()
4369
COUNTRIES = ["USA", "Canada", "UK", "Germany", "South Africa", "Cuba", "Iran", "Russia"]
@@ -210,14 +236,13 @@ def generate_customers(num_customers=50):
210236

211237
if will_be_sanctioned:
212238
# Use real OFAC names for customers that will appear in sanctions
213-
if is_company and OFAC_COMPANIES:
214-
name = random.choice(OFAC_COMPANIES)
215-
print(f"OFAC Company - {name}")
216-
elif not is_company and OFAC_INDIVIDUALS:
217-
name = random.choice(OFAC_INDIVIDUALS)
218-
print(f"OFAC Individual - {name}")
239+
ofac_name = get_unique_ofac_name(is_company)
240+
if ofac_name:
241+
name = ofac_name
242+
print(f"OFAC {'Company' if is_company else 'Individual'} - {name}")
219243
else:
220244
name = fake.company() if is_company else fake.name() # Fallback to generated name
245+
print(f"Warning: No unique OFAC {'company' if is_company else 'individual'} names available, using fake name")
221246

222247
# Add to sanctioned entities list
223248
sanctioned_entities.append({
@@ -588,12 +613,11 @@ def generate_sanctions(sanctioned_entities, num_additional=50):
588613
is_company = random.random() < 0.4 # 40% companies in sanctions
589614

590615
# Use real OFAC names when available
591-
if is_company and OFAC_COMPANIES:
592-
name = random.choice(OFAC_COMPANIES)
593-
elif not is_company and OFAC_INDIVIDUALS:
594-
name = random.choice(OFAC_INDIVIDUALS)
616+
ofac_name = get_unique_ofac_name(is_company)
617+
if ofac_name:
618+
name = ofac_name
595619
else:
596-
# Fallback to fake names if OFAC list not available
620+
# Fallback to fake names if no unique OFAC names available
597621
name = fake.company() if is_company else fake.name()
598622

599623
# Sanctions more likely from high-risk countries
@@ -637,6 +661,7 @@ def generate_sanctions(sanctioned_entities, num_additional=50):
637661

638662
# Main execution
639663
print("Starting AML data generation...")
664+
reset_ofac_tracking() # Reset OFAC name tracking for fresh generation
640665
print("Generating 5,000 customers and accounts...")
641666
customers, accounts, sanctioned_entities = generate_customers(5000)
642667
print(f"Generated {len(customers)} customers, {len(accounts)} accounts, {len(sanctioned_entities)} potential sanctioned entities")

0 commit comments

Comments
 (0)