Skip to content

Commit 928b612

Browse files
committed
Adds OFAC extracts to AML demo
1 parent f076db1 commit 928b612

File tree

10 files changed

+433864
-430027
lines changed

10 files changed

+433864
-430027
lines changed

.data/aml/generate.py

Lines changed: 62 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,34 @@
1010
Faker.seed(42)
1111
random.seed(42)
1212

13+
# Load real OFAC names from split files
14+
def _read_ofac_names(path, label):
    """Return the stripped, non-blank lines of *path*.

    Args:
        path: Text file to read, one name per line, resolved against the
            current working directory when relative.
        label: Plural noun used in the progress message
            (for example ``"individuals"``).

    Returns:
        A list of names in file order; an empty list when the file does
        not exist (a warning is printed in that case).
    """
    try:
        # Keep the try body limited to the I/O that can actually raise.
        with open(path, 'r', encoding='utf-8') as f:
            stripped = (line.strip() for line in f)
            names = [entry for entry in stripped if entry]
    except FileNotFoundError:
        # Best effort: a missing extract is not fatal for the caller.
        print(f"Warning: {path} not found.")
        return []

    print(f"Loaded {len(names)} OFAC {label}")
    return names


def load_ofac_names():
    """Load real OFAC sanctioned names from the split files.

    Reads ``ofac_individuals.txt`` and ``ofac_companies.txt`` from the
    current working directory. Blank lines are ignored and surrounding
    whitespace is stripped from every name. A missing file yields an empty
    list for that category instead of raising.

    Returns:
        A ``(individuals, companies)`` tuple of lists of names.
    """
    individuals = _read_ofac_names('ofac_individuals.txt', 'individuals')
    companies = _read_ofac_names('ofac_companies.txt', 'companies')

    # Printed whenever both lists are empty, whether the files were missing
    # or merely contained no names.
    if not individuals and not companies:
        print("Warning: No OFAC files found. Using fallback fake names.")

    return individuals, companies
37+
38+
# Load OFAC names at startup
39+
OFAC_INDIVIDUALS, OFAC_COMPANIES = load_ofac_names()
40+
1341
# Constants
1442
CURRENT_DATE = datetime.today()
1543
COUNTRIES = ["USA", "Canada", "UK", "Germany", "South Africa", "Cuba", "Iran", "Russia"]
@@ -128,7 +156,6 @@ def generate_customers(num_customers=50):
128156
for i in range(num_customers):
129157
account_id = 10000 + i
130158
is_company = random.random() < 0.2 # 20% companies
131-
name = fake.company() if is_company else fake.name()
132159

133160
# More realistic age distribution
134161
if is_company:
@@ -178,18 +205,34 @@ def generate_customers(num_customers=50):
178205

179206
address = fake.address().replace("\n", ", ")
180207

181-
# Store potential sanctioned entities (some customers will appear in sanctions)
182-
if blacklisted or (nationality in HIGH_RISK_COUNTRIES and random.random() < 0.3):
208+
# Determine if this customer should be sanctioned and use OFAC name if so
209+
will_be_sanctioned = blacklisted or (nationality in HIGH_RISK_COUNTRIES and random.random() < 0.3)
210+
211+
if will_be_sanctioned:
212+
# Use real OFAC names for customers that will appear in sanctions
213+
if is_company and OFAC_COMPANIES:
214+
name = random.choice(OFAC_COMPANIES)
215+
print(f"OFAC Company - {name}")
216+
elif not is_company and OFAC_INDIVIDUALS:
217+
name = random.choice(OFAC_INDIVIDUALS)
218+
print(f"OFAC Individual - {name}")
219+
else:
220+
name = fake.company() if is_company else fake.name() # Fallback to generated name
221+
222+
# Add to sanctioned entities list
183223
sanctioned_entities.append({
184-
"name": name,
224+
"name": name, # Same name as customer record
185225
"address": address,
186226
"entity_type": "company" if is_company else "individual",
187227
"nationality": nationality
188228
})
229+
else:
230+
# Normal customer - use fake generated name
231+
name = fake.company() if is_company else fake.name()
189232

190233
customers.append({
191234
"customer_id": i + 1,
192-
"name": name,
235+
"name": name, # Now uses OFAC name if customer will be sanctioned
193236
"account": account_id,
194237
"dob": dob.strftime("%Y-%m-%d"),
195238
"nationality": nationality,
@@ -202,7 +245,7 @@ def generate_customers(num_customers=50):
202245
# MongoDB accounts - exactly same data but different structure
203246
accounts.append({
204247
"account_id": account_id,
205-
"name": name,
248+
"name": name, # Same name as customer record
206249
"entity_type": "company" if is_company else "individual",
207250
"contact_information": {
208251
"address": address,
@@ -526,7 +569,8 @@ def generate_sars(customers, transactions):
526569
def generate_sanctions(sanctioned_entities, num_additional=50):
527570
sanctions = []
528571

529-
# First, add some of our actual customers to sanctions (creates real relationships)
572+
# TODO
573+
# Consider removing the limit of 20 here to get all sanctioned entities in future
530574
for i, entity in enumerate(sanctioned_entities[:min(20, len(sanctioned_entities))]):
531575
# Use actual customer data for realistic sanctions entries
532576
sanctions.append({
@@ -539,10 +583,18 @@ def generate_sanctions(sanctioned_entities, num_additional=50):
539583
"list_type": random.choices(["OFAC", "UN", "EU"], weights=[50, 30, 20], k=1)[0]
540584
})
541585

542-
# Then add additional fictional sanctioned entities
586+
# Add additional real OFAC sanctioned entities
543587
for i in range(num_additional):
544588
is_company = random.random() < 0.4 # 40% companies in sanctions
545-
name = fake.company() if is_company else fake.name()
589+
590+
# Use real OFAC names when available
591+
if is_company and OFAC_COMPANIES:
592+
name = random.choice(OFAC_COMPANIES)
593+
elif not is_company and OFAC_INDIVIDUALS:
594+
name = random.choice(OFAC_INDIVIDUALS)
595+
else:
596+
# Fallback to fake names if OFAC list not available
597+
name = fake.company() if is_company else fake.name()
546598

547599
# Sanctions more likely from high-risk countries
548600
country = random.choices(HIGH_RISK_COUNTRIES + ["North Korea", "Syria", "Venezuela"],
@@ -627,4 +679,4 @@ def generate_sanctions(sanctioned_entities, num_additional=50):
627679
print(f" - {sum(1 for t in pg_transactions if t['is_laundering']):,} suspicious transactions")
628680
print(f" - {len(sars):,} SARs generated")
629681
print(f" - {len(sanctions):,} sanctions entries")
630-
print(f" - Files saved to postgres/ and mongo_seed/ directories")
682+
print(f" - Files saved to postgres/ and mongo_seed/ directories")

0 commit comments

Comments
 (0)