-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgenerate_sample.py
More file actions
88 lines (70 loc) · 3.51 KB
/
generate_sample.py
File metadata and controls
88 lines (70 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""
Sample Data Generator
Generates a realistic 425-transaction CSV (400 background rows plus 25 fraud-pattern rows) with embedded fraud patterns:
- 2 cycles (length 3 and 4)
- 1 smurfing fan-in cluster
- 1 layered shell chain
Run: python generate_sample.py
Output: sample_transactions.csv
"""
import csv
import random
from datetime import datetime, timedelta
# Fixed seed so every run draws the same random sequence.
random.seed(42)

# Reference instant; ts() expresses every timestamp as an hour offset from it.
BASE_TIME = datetime(year=2025, month=1, day=1, hour=9)

# One [transaction_id, sender_id, receiver_id, amount, timestamp] list per transaction.
rows = []
# Single-element list so next_id() can advance the counter without ``global``.
txn_counter = [1]


def next_id():
    """Return the next sequential transaction id (``TXN_00001``, ``TXN_00002``, ...).

    Each call consumes the current value of ``txn_counter[0]`` and advances it
    by one, so ids are unique and increase in call order.
    """
    current = txn_counter[0]
    txn_counter[0] = current + 1
    return f"TXN_{current:05d}"
def rand_amount(lo=100, hi=50000):
    """Return a random amount drawn uniformly between ``lo`` and ``hi``.

    The value is rounded to two decimal places.
    """
    raw = random.uniform(lo, hi)
    return round(raw, 2)
def ts(delta_hours=0):
    """Return ``BASE_TIME`` shifted by ``delta_hours`` hours, formatted as text.

    The result uses the ``YYYY-MM-DD HH:MM:SS`` layout; fractional hours are
    accepted.
    """
    moment = BASE_TIME + timedelta(hours=delta_hours)
    return moment.strftime("%Y-%m-%d %H:%M:%S")
# ------------------------------------------------------------------ #
# Legitimate background traffic: 400 transactions across 500 accounts #
# (sparse graph → few accidental cycles)                              #
# ------------------------------------------------------------------ #
accounts = [f"ACC_{i:05d}" for i in range(1, 501)]  # ACC_00001 .. ACC_00500

# Senders come only from the first 250 accounts and receivers only from the
# last 250, so every background edge runs from a lower to a higher index and
# the background traffic on its own cannot close a cycle.
senders, receivers = accounts[:250], accounts[250:]
for _ in range(400):
    src = random.choice(senders)
    dst = random.choice(receivers)
    # 8760 h = 365 days: spread the background over a year from the base time.
    offset_hours = random.uniform(0, 8760)
    rows.append([next_id(), src, dst, rand_amount(), ts(offset_hours)])
# ------------------------------------------------------------------ #
# Pattern 1: Cycle of length 3 — ACC_F001 → F002 → F003 → F001        #
# ------------------------------------------------------------------ #
f1, f2, f3 = "ACC_F001", "ACC_F002", "ACC_F003"
ring3 = (f1, f2, f3)
# Pair each account with its successor; the rotation wraps the last account
# back to the first, which closes the cycle.
for src, dst in zip(ring3, ring3[1:] + ring3[:1]):
    rows.append([next_id(), src, dst, rand_amount(5000, 10000), ts(10)])
# ------------------------------------------------------------------ #
# Pattern 2: Cycle of length 4 — F004→F005→F006→F007→F004             #
# ------------------------------------------------------------------ #
f4, f5, f6, f7 = "ACC_F004", "ACC_F005", "ACC_F006", "ACC_F007"
ring4 = (f4, f5, f6, f7)
# Pair each account with its successor; the rotation wraps the last account
# back to the first, which closes the cycle.
for src, dst in zip(ring4, ring4[1:] + ring4[:1]):
    rows.append([next_id(), src, dst, rand_amount(2000, 8000), ts(20)])
# ------------------------------------------------------------------ #
# Pattern 3: Smurfing fan-in — 15 senders → SMURF_HUB within 24 h     #
# ------------------------------------------------------------------ #
hub = "ACC_SMURFHUB"
# Senders ACC_S000 .. ACC_S014 each pay the hub once, spaced 30 minutes apart
# starting at hour offset 50 (so all 15 transfers land within 7 hours).
rows.extend(
    [next_id(), f"ACC_S{i:03d}", hub, rand_amount(500, 2000), ts(50 + i * 0.5)]
    for i in range(15)
)
# ------------------------------------------------------------------ #
# Pattern 4: Layered shell — ORIGIN → SHELL1 → SHELL2 → BENE          #
# ------------------------------------------------------------------ #
origin, sh1, sh2, bene = "ACC_ORIGIN", "ACC_SH001", "ACC_SH002", "ACC_BENE01"
chain = (origin, sh1, sh2, bene)
# Consecutive hops only (no wrap-around): the chain ends at the beneficiary.
for src, dst in zip(chain, chain[1:]):
    rows.append([next_id(), src, dst, rand_amount(10000, 30000), ts(70)])
# ------------------------------------------------------------------ #
# Write CSV                                                           #
# ------------------------------------------------------------------ #
output_file = "sample_transactions.csv"
# newline="" hands line-ending control to the csv module; the explicit
# encoding keeps the file independent of the platform's locale default.
with open(output_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    # Header first, then every accumulated transaction in insertion order.
    writer.writerow(["transaction_id", "sender_id", "receiver_id", "amount", "timestamp"])
    writer.writerows(rows)
print(f"Generated {len(rows)} transactions → {output_file}")