-
Notifications
You must be signed in to change notification settings - Fork 1
/
generate_dataset.py
63 lines (51 loc) · 2.11 KB
/
generate_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Use this bot to generate a dataset of any Discord user's messages.
# Hint: The more messages you have, the more accurate the AI will be.
# This script uses a bot user instead of a selfbot to reduce the chances of a Discord ban.
# Requires discord.py (or any fork)
# stdlib
import os
# discord.py
import discord
from discord.ext import commands
# third-party
from dotenv import load_dotenv
# Load settings from a .env file first so they are available through
# os.environ by the time the bot is started.
load_dotenv()

# The bot instance every command and event below is registered on.
# Commands are invoked with the "++" prefix (e.g. "++fetch").
# NOTE(review): newer discord.py releases (2.x) appear to require an explicit
# `intents=` argument here -- confirm which library version this targets.
client = commands.Bot(command_prefix="++")
@client.command()
async def fetch(ctx, user: discord.Member, gpt=""):
    """Dump every message `user` sent in this server to a text file and upload it.

    Walks the full history of each text channel in the invoking guild and
    writes the content of every non-empty message authored by `user`, one
    message per line.

    Args:
        ctx: Invocation context of the command.
        user: Member whose messages are collected.
        gpt: Pass the literal string "gpt" to wrap each line in
            ``<|startoftext|>`` / ``<|endoftext|>`` markers and write to
            ``gpt_dataset.txt``; leave empty for plain lines in
            ``dataset.txt``. Any other value prints the usage text.
    """
    if gpt and gpt != "gpt":
        return await ctx.send(f"Usage: {ctx.prefix}fetch <user> [gpt]\n<> is required, [] is optional")

    # ctx.guild is None when the command is invoked outside a server (e.g. in
    # a DM); the channel scan below needs a guild to iterate over.
    if ctx.guild is None:
        return await ctx.send("This command can only be used in a server.")

    # Clear out the results of any previous run, whichever mode it used.
    for stale in ("dataset.txt", "gpt_dataset.txt"):
        if os.path.isfile(stale):
            os.remove(stale)

    # Decide the output file up front so it is always defined, even when the
    # user turns out to have no messages.
    name = "gpt_dataset.txt" if gpt else "dataset.txt"

    msg = await ctx.send("Fetching messages...\nThis will take a while.")
    count = 0    # messages inspected, across all channels (progress only)
    written = 0  # messages by `user` actually written to the dataset

    # Open the dataset once instead of re-opening it for every matching message.
    with open(name, "w", encoding="utf-8") as f:
        for channel in ctx.guild.text_channels:
            try:
                async for message in channel.history(limit=None):
                    count += 1
                    if count % 500 == 0:
                        print(f"Processed {count} messages")

                    if message.author.id != user.id or not message.content:
                        continue

                    if gpt:
                        f.write(f"<|startoftext|> {message.content} <|endoftext|>\n")
                    else:
                        f.write(f"{message.content}\n")
                    written += 1
            except discord.Forbidden:
                # Best effort: skip channels the bot is not allowed to read.
                continue

    await msg.delete()

    if not written:
        # Nothing to upload; remove the empty file and tell the caller rather
        # than failing while building the attachment.
        os.remove(name)
        return await ctx.send(f"No messages from {user} were found in the channels I can read.")

    await ctx.send(file=discord.File(name))
@client.event
async def on_command_error(ctx, error):
    """Report argument problems to the user and surface every other error.

    Args:
        ctx: Invocation context of the command that failed.
        error: The exception raised while parsing or running the command.
    """
    # A missing <user> argument deserves the same hint as an unresolvable one.
    if isinstance(error, (commands.BadArgument, commands.MissingRequiredArgument)):
        await ctx.send("Please provide a valid user.")
        return

    # Messages that merely start with the prefix are not worth reporting.
    if isinstance(error, commands.CommandNotFound):
        return

    # Defining this handler replaces the library's default error reporting, so
    # anything not handled above must be re-raised or it vanishes silently.
    raise error
@client.event
async def on_ready():
    """Print a short notice to the console when the ready event fires."""
    print("Ready")
# Start the bot with the token taken from the TOKEN environment variable.
# Fail early with a clear message instead of handing None to client.run().
token = os.environ.get("TOKEN")
if not token:
    raise SystemExit("TOKEN is not set. Define it in the environment or in a .env file.")
client.run(token)