MM_Detection/generate_sample.py at main · Samrudhp/MM_Detection · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""
Sample Data Generator
Generates a realistic 425-transaction CSV (400 background rows plus 25 fraud-pattern rows) with embedded fraud patterns:
  - 2 cycles (length 3 and 4)
  - 1 smurfing fan-in cluster
  - 1 layered shell chain

Run: python generate_sample.py
Output: sample_transactions.csv
"""

import csv
import random
from datetime import datetime, timedelta

# Fixed seed so that every run produces the same CSV.
random.seed(42)

# Reference instant; ts() measures its hour offsets from here.
BASE_TIME = datetime(year=2025, month=1, day=1, hour=9)

# One-element list so next_id() can advance the counter in place
# without needing a `global` statement.
txn_counter = [1]

# Accumulates one [transaction_id, sender_id, receiver_id, amount, timestamp]
# list per generated transaction.
rows = []


def next_id():
    """Return the next sequential transaction ID and advance the counter.

    IDs have the form ``TXN_`` followed by the counter value zero-padded
    to at least five digits (e.g. ``TXN_00001``). The counter lives in the
    module-level one-element list ``txn_counter``.
    """
    current = txn_counter[0]
    txn_counter[0] = current + 1
    return f"TXN_{current:05d}"


def rand_amount(lo=100, hi=50000):
    """Return a random amount drawn uniformly from [lo, hi], rounded to 2 decimals.

    Uses the module-level ``random`` generator, so results are reproducible
    under a fixed seed.
    """
    raw_value = random.uniform(lo, hi)
    return round(raw_value, 2)


def ts(delta_hours=0):
    """Return BASE_TIME shifted by ``delta_hours`` as ``YYYY-MM-DD HH:MM:SS``.

    ``delta_hours`` may be fractional (e.g. 0.5 for thirty minutes).
    """
    moment = BASE_TIME + timedelta(hours=delta_hours)
    return moment.strftime("%Y-%m-%d %H:%M:%S")


# ------------------------------------------------------------------ #
# Background traffic: 400 random transfers over a pool of 500 accounts
# ------------------------------------------------------------------ #
accounts = [f"ACC_{n:05d}" for n in range(1, 501)]   # ACC_00001 .. ACC_00500

# Senders come only from the first 250 accounts and receivers only from the
# last 250, so no background account both sends and receives. This traffic
# alone therefore cannot form a cycle.
senders = accounts[:250]
receivers = accounts[250:]

for _ in range(400):
    src = random.choice(senders)
    dst = random.choice(receivers)
    offset_hours = random.uniform(0, 8760)       # anywhere within 8760 h (365 days)
    txn_id = next_id()
    amount = rand_amount()
    rows.append([txn_id, src, dst, amount, ts(offset_hours)])

# ------------------------------------------------------------------ #
# Pattern 1: 3-account cycle  ACC_F001 → ACC_F002 → ACC_F003 → ACC_F001
# ------------------------------------------------------------------ #
f1, f2, f3 = "ACC_F001", "ACC_F002", "ACC_F003"
ring3 = (f1, f2, f3)
# Pair each account with its successor; the last wraps around to the first.
for src, dst in zip(ring3, ring3[1:] + ring3[:1]):
    # Every hop shares the same timestamp (BASE_TIME + 10 h).
    rows.append([next_id(), src, dst, rand_amount(5000, 10000), ts(10)])

# ------------------------------------------------------------------ #
# Pattern 2: 4-account cycle  F004 → F005 → F006 → F007 → F004
# ------------------------------------------------------------------ #
f4, f5, f6, f7 = "ACC_F004", "ACC_F005", "ACC_F006", "ACC_F007"
ring4 = [f4, f5, f6, f7]
for pos, src in enumerate(ring4):
    # Modulo wraps the final account back to the first, closing the loop.
    dst = ring4[(pos + 1) % len(ring4)]
    # Every hop shares the same timestamp (BASE_TIME + 20 h).
    rows.append([next_id(), src, dst, rand_amount(2000, 8000), ts(20)])

# ------------------------------------------------------------------ #
# Pattern 3: Smurfing fan-in — 15 distinct senders → ACC_SMURFHUB
# ------------------------------------------------------------------ #
hub = "ACC_SMURFHUB"
for idx in range(15):
    # Deposits are spaced 30 minutes apart starting at BASE_TIME + 50 h,
    # so the whole burst spans 7 hours (well inside a 24 h window).
    offset_hours = 50 + idx * 0.5
    rows.append(
        [next_id(), f"ACC_S{idx:03d}", hub, rand_amount(500, 2000), ts(offset_hours)]
    )

# ------------------------------------------------------------------ #
# Pattern 4: Layered shell chain — ORIGIN → SH001 → SH002 → BENE01
# ------------------------------------------------------------------ #
origin, sh1, sh2, bene = "ACC_ORIGIN", "ACC_SH001", "ACC_SH002", "ACC_BENE01"
chain = [origin, sh1, sh2, bene]
# Walk consecutive hops along the chain; unlike the cycles it does not wrap.
for src, dst in zip(chain, chain[1:]):
    # Every hop shares the same timestamp (BASE_TIME + 70 h).
    rows.append([next_id(), src, dst, rand_amount(10000, 30000), ts(70)])

# ------------------------------------------------------------------ #
# Write CSV                                                           #
# ------------------------------------------------------------------ #
output_file = "sample_transactions.csv"
# newline="" hands line-ending control to the csv module, as its docs require.
# encoding="utf-8" is explicit so the output file does not depend on the
# platform's locale default encoding.
with open(output_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    # Header row; column order matches the lists accumulated in `rows`.
    writer.writerow(["transaction_id", "sender_id", "receiver_id", "amount", "timestamp"])
    writer.writerows(rows)

# Summary for the user: how many rows were written, and where.
print(f"Generated {len(rows)} transactions → {output_file}")