Skip to content

Commit

Permalink
commit
Browse files Browse the repository at this point in the history
  • Loading branch information
shrkvr2024 committed Oct 3, 2024
1 parent af4bdaf commit c764233
Show file tree
Hide file tree
Showing 33 changed files with 117,014 additions and 0 deletions.
9 changes: 9 additions & 0 deletions data_manufacture-main/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
venv/
*.txt
*.csv
bad_spelling_data/*.txt
bad_spelling_data/*.csv
generated_data/
todo.txt
csv files/
saved_gpt/
13 changes: 13 additions & 0 deletions data_manufacture-main/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# data_manufacture

## How to load the tokenizer:

from transformers import PreTrainedTokenizerFast
wrapped_tokenizer = PreTrainedTokenizerFast(
tokenizer_file="name of the .json file of the tokenizer",
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
pad_token = "<|endoftext|>"

)
tokenizer=wrapped_tokenizer
47 changes: 47 additions & 0 deletions data_manufacture-main/api_test_with_flask.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from flask import Flask,request,jsonify
from transformers import GPT2LMHeadModel,PreTrainedTokenizerFast
# Wrap the trained BPE tokenizer file so it exposes the standard
# Hugging Face fast-tokenizer interface.
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer_BPE3.json",
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",

)
tokenizer=wrapped_tokenizer
# GPT-2 has no dedicated pad token; reuse end-of-text for padding.
tokenizer.pad_token_id = tokenizer.eos_token_id

# Load the fine-tuned GPT-2 checkpoint from the local 'testsample30'
# directory -- TODO confirm the checkpoint path exists at deploy time.
model=GPT2LMHeadModel.from_pretrained('testsample30',pad_token_id=tokenizer.eos_token_id)


# Inference only: switch off dropout / training-mode layers.
model.eval()


app=Flask(__name__)



@app.route("/wordcorrector", methods=["POST"])
def word_correct():
    """POST endpoint: accept JSON ``{"data": <misspelled name>}`` and
    return the model's corrected text.

    Returns ``{"generated text": <correction>}`` with HTTP 201 on
    success, or an error payload with HTTP 400 when the body is not
    JSON or has no "data" field (previously this raised KeyError and
    produced an opaque HTTP 500).
    """
    jdata = request.get_json(silent=True)
    if not jdata or 'data' not in jdata:
        return jsonify({"error": "JSON body with a 'data' field is required"}), 400
    data = jdata['data']
    tokenized_sequence = tokenizer(data, return_tensors='pt')
    gen_tokens = model.generate(
        tokenized_sequence.input_ids,
        do_sample=True,
        temperature=0.9,
        max_length=15,
    )
    gen_text = tokenizer.batch_decode(gen_tokens)[0]
    # The model echoes the prompt followed by the correction: strip the
    # prompt (plus one separator character) and keep only the first
    # generated line.  Assumes the decoded text starts with `data` --
    # TODO confirm for this checkpoint.
    correctname = gen_text[len(data) + 1:]
    corrected = correctname.split('\n')[0]
    # NOTE(review): the response key keeps the original "generated text"
    # spelling (with a space) for API compatibility.
    return jsonify({"generated text": corrected}), 201


if __name__ == "__main__":
    # Development server only -- debug=True must not be used in production.
    app.run(debug=True)
102 changes: 102 additions & 0 deletions data_manufacture-main/bad_spelling_data/more_less.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import pandas as pd
import random
import csv
import word_shuffler

# Pool of Persian letters used when corrupting names.
alphabet = ['ا', 'ب', 'پ', 'ت', 'ث','ج', 'چ', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن', 'ه', 'ی']
# NOTE(review): hard-coded home-relative paths; both CSVs are assumed to
# contain a "Names" column -- confirm against the dataset layout.
surnames = pd.read_csv("~/personal/correct_dataset/surname_2.csv")["Names"]
names = pd.read_csv("~/personal/correct_dataset/names.csv")["Names"]


def generate_word(name_size: int, letters=None):
    """Return a random word of ``name_size`` characters drawn with
    replacement from *letters*.

    Parameters
    ----------
    name_size : int
        Length of the word to generate (0 yields "").
    letters : sequence of str, optional
        Pool to draw from; defaults to the module-level Persian
        ``alphabet`` (backward compatible with the old signature).
    """
    pool = alphabet if letters is None else letters
    return ''.join(random.choices(pool, k=name_size))


def count_part(name):
    """Return the number of whitespace-separated parts in *name*."""
    return len(name.split())

def less_letter(name):
    """Return *name* with one randomly chosen character removed.

    Bug fixed: the original called ``name.replace(...)`` and discarded
    the result (strings are immutable), so it always returned *name*
    unchanged -- and ``replace`` would have removed *every* occurrence
    of the chosen character anyway.  This version deletes exactly the
    character at the randomly selected index.  An empty input is
    returned unchanged (the original raised ValueError from randint).
    """
    if not name:
        return name
    index = random.randint(0, len(name) - 1)
    return name[:index] + name[index + 1:]

def more_letter(name, letters=None):
    """Return *name* with one random letter inserted at a random position.

    Parameters
    ----------
    name : str
        Word to corrupt (must be non-empty, as in the original).
    letters : sequence of str, optional
        Pool to draw the inserted letter from; defaults to the
        module-level Persian ``alphabet`` (backward compatible).
    """
    pool = alphabet if letters is None else letters
    random_letter = random.choice(pool)
    # NOTE(review): randint(0, len(name) - 1) can never insert *after*
    # the last character; kept as-is to preserve existing behaviour.
    random_index = random.randint(0, len(name) - 1)
    return name[:random_index] + random_letter + name[random_index:]

# Write one corrupted variant of every name: each whitespace-separated
# part gets one extra random letter inserted.
#
# Fixes in this revision:
# * multi-part names used ``name[0]`` / ``name[1]``, which index single
#   *characters*, not words -- corrupt each split part instead;
# * the redundant ``file.close()`` after the ``with`` block is dropped.
with open("more_name.txt", 'wt') as file:
    for name in list(names):
        parts = name.split()
        # As in the original, names with more than 3 parts are skipped.
        if 1 <= len(parts) <= 3:
            file.write(' '.join(more_letter(part) for part in parts) + "\n")

# Same corruption for surnames (see the name loop above for the fixed
# ``name[0]``-indexing bug and the removed redundant ``file.close()``).
with open("more_surname.txt", 'wt') as file:
    for surname in list(surnames):
        parts = surname.split()
        # Surnames with more than 3 parts are skipped, as before.
        if 1 <= len(parts) <= 3:
            file.write(' '.join(more_letter(part) for part in parts) + "\n")


# with open("less_name.txt", 'wt') as file:
# for name in list(names):
# if count_part(name) == 1:
# less_word = less_letter(name)
# file.write(less_word + "\n")


# elif count_part(name) == 2:
# less_word = less_letter(name[0]) + less_letter(name[1])
# file.write(less_word + "\n")

# elif count_part(name) == 3:
# less_word = less_letter(name[0]) + less_letter(name[1]) + less_letter(name[2])
# file.write(less_word + "\n")

# file.close()

# # generates wrong surnames
# with open("less_surname.txt", 'wt') as file:
# for surname in list(surnames):
# if count_part(surname) == 1:
# less_word = less_letter(surname)
# file.write(less_word + "\n")


# elif count_part(surname) == 2:
# less_word = less_letter(surname[0]) + less_letter(surname[1])
# file.write(less_word + "\n")

# elif count_part(surname) == 3:
# less_word = less_letter(surname[0]) + less_letter(surname[1]) + less_letter(surname[2])
# file.write(less_word + "\n")

# file.close()
68 changes: 68 additions & 0 deletions data_manufacture-main/bad_spelling_data/replace_letter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import pandas as pd
import random
import csv
import word_shuffler

# Pool of Persian letters used when corrupting names.
alphabet = ['ا', 'ب', 'پ', 'ت', 'ث','ج', 'چ', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن', 'ه', 'ی']
# NOTE(review): hard-coded home-relative paths; both CSVs are assumed to
# contain a "Names" column -- confirm against the dataset layout.
surnames = pd.read_csv("~/personal/correct_dataset/surname_2.csv")["Names"]
names = pd.read_csv("~/personal/correct_dataset/names.csv")["Names"]


def generate_word(name_size: int, letters=None):
    """Return a random word of ``name_size`` characters drawn with
    replacement from *letters* (defaults to the module-level Persian
    ``alphabet``; the optional parameter is a backward-compatible
    generalization)."""
    pool = alphabet if letters is None else letters
    return ''.join(random.choices(pool, k=name_size))


def count_part(name):
    """Return how many whitespace-delimited words *name* contains."""
    words = name.split()
    return len(words)

def replace_letter(name, letters=None):
    """Replace the character at one random index of *name* with a random
    letter, or return ``False`` when the drawn letter equals the
    character already there (callers test the result's truthiness).

    Fixes two defects in the original:
    * ``is not`` compared string *identity*, not equality -- use ``!=``;
    * ``str.replace`` returns a new string (the result was discarded, so
      the function never changed anything) and would have replaced every
      occurrence -- replace only position ``index`` via slicing.

    *letters* optionally overrides the module-level Persian ``alphabet``
    (backward-compatible generalization).
    """
    index = random.randint(0, len(name) - 1)
    random_letter = random.choice(alphabet if letters is None else letters)
    if name[index] != random_letter:
        return name[:index] + random_letter + name[index + 1:]
    return False

# Write one single-letter-substitution variant of every name.
#
# Fixes in this revision:
# * multi-part names used ``name[0]`` / ``name[1]`` (single characters,
#   not words) -- corrupt each whitespace-separated part instead;
# * ``replace_letter`` can return ``False``; the original concatenated
#   it directly (``False + str`` raises TypeError) -- skip the line when
#   any part was left unchanged;
# * redundant ``file.close()`` after ``with`` removed.
with open("replace_letter_name.txt", 'wt') as file:
    for name in list(names):
        parts = name.split()
        # Names with more than 3 parts are skipped, as in the original.
        if 1 <= len(parts) <= 3:
            corrupted = [replace_letter(part) for part in parts]
            if all(corrupted):
                file.write(' '.join(corrupted) + "\n")

# Same substitution corruption for surnames (see the name loop above for
# the fixed character-indexing and False-concatenation bugs).
with open("replace_letter_surname.txt", 'wt') as file:
    for surname in list(surnames):
        parts = surname.split()
        # Surnames with more than 3 parts are skipped, as before.
        if 1 <= len(parts) <= 3:
            corrupted = [replace_letter(part) for part in parts]
            if all(corrupted):
                file.write(' '.join(corrupted) + "\n")
18 changes: 18 additions & 0 deletions data_manufacture-main/bad_spelling_data/reversed_name.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import pandas as pd

# NOTE(review): hard-coded home-relative paths; CSVs assumed to have a
# "Names" column -- confirm against the dataset layout.
surname_data = pd.read_csv("~/personal/correct_dataset/surname_2.csv")["Names"]
name_data = pd.read_csv("~/personal/correct_dataset/names.csv")["Names"]
# Bug fixed: drop_duplicates() returns a NEW Series; the original
# discarded the result, so duplicates were silently kept.
surname_data = surname_data.drop_duplicates()
name_data = name_data.drop_duplicates()

# Write every "<surname> <name>" combination (full Cartesian product).
# Per-row debug prints removed (they dominate runtime on large data);
# the redundant f.close() inside the `with` block is also dropped.
# NOTE(review): the output file name keeps the original "reverced" typo
# so downstream consumers keep working.
with open("reverced_name.txt", 'wt') as f:
    for surname in surname_data:
        for name in name_data:
            f.write(surname + " " + name + "\n")
20 changes: 20 additions & 0 deletions data_manufacture-main/bad_spelling_data/word_shuffler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import pandas as pd
from random import shuffle
def shuffle_word(word):
    """Return the characters of *word* re-ordered uniformly at random."""
    chars = list(word)
    shuffle(chars)
    return ''.join(chars)


# NOTE(review): hard-coded home-relative paths; CSVs assumed to have a
# "Names" column -- confirm against the dataset layout.
surname_data = pd.read_csv("~/personal/correct_dataset/surname_2.csv")["Names"]
name_data = pd.read_csv("~/personal/correct_dataset/names.csv")["Names"]
# Bug fixed: drop_duplicates() returns a NEW Series; the original
# discarded the result, so duplicates were silently kept.
surname_data = surname_data.drop_duplicates()
name_data = name_data.drop_duplicates()

# Write "<shuffled surname> <shuffled name>" for the full cross product.
# Per-row debug prints and the redundant f.close() inside `with` removed.
with open("word_shuffled.txt", 'wt') as f:
    for surname in surname_data:
        for name in name_data:
            f.write(shuffle_word(surname) + " " + shuffle_word(name) + "\n")

41 changes: 41 additions & 0 deletions data_manufacture-main/bad_spelling_data/worng_letter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import pandas as pd
import random
# Groups of Persian letters that sound alike and are common spelling
# mistakes for one another.
zs="ضظز"
s_letters="سثص"

def change_letters(word):
    """Return *word* with one commonly-confused Persian letter class
    replaced by a random letter from the same sound group (ط and ت are
    simply swapped), or *word* unchanged when none of those letters
    occur.

    Fixes in this revision:
    * the original looped ``for i in word`` but never used ``i`` --
      every iteration redid the same work; the loop is removed;
    * the unused random draws ``g`` and ``f`` are removed;
    * an empty *word* previously hit ``return newword`` with ``newword``
      unbound (NameError); it now returns "" unchanged.

    NOTE(review): like the original, this replaces *every* occurrence of
    the first matching letter, and the random pick may equal the letter
    being replaced (a no-op) -- behaviour preserved.
    """
    d = random.randint(0, 2)
    if 'ز' in word:
        return word.replace('ز', zs[d])
    if 'ظ' in word:
        return word.replace('ظ', zs[d])
    if 'ض' in word:
        return word.replace('ض', zs[d])
    if 'س' in word:
        return word.replace('س', s_letters[d])
    if 'ث' in word:
        return word.replace('ث', s_letters[d])
    if 'ص' in word:
        return word.replace('ص', s_letters[d])
    if 'ط' in word:
        return word.replace('ط', 'ت')
    if 'ت' in word:
        return word.replace('ت', 'ط')
    return word


# NOTE(review): hard-coded home-relative paths; CSVs assumed to have a
# "Names" column -- confirm against the dataset layout.
surname_data = pd.read_csv("~/personal/correct_dataset/surname_2.csv")["Names"]
name_data = pd.read_csv("~/personal/correct_dataset/names.csv")["Names"]
# Bug fixed: drop_duplicates() returns a NEW Series; the original
# discarded the result, so duplicates were silently kept.
surname_data = surname_data.drop_duplicates()
name_data = name_data.drop_duplicates()

# Write "<corrupted surname> <corrupted name>" for the full cross
# product.  Per-row debug prints and the redundant f.close() removed.
with open("final_wrong_letter.txt", 'wt') as f:
    for surname in surname_data:
        for name in name_data:
            f.write(change_letters(surname) + " " + change_letters(name) + "\n")
54 changes: 54 additions & 0 deletions data_manufacture-main/bad_spelling_data/wrong_name.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import pandas as pd
import random
import csv
import word_shuffler

# Pool of Persian letters used when generating random "wrong" names.
alphabet = ['ا', 'ب', 'پ', 'ت', 'ث','ج', 'چ', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن', 'ه', 'ی']
# NOTE(review): hard-coded home-relative paths; both CSVs are assumed to
# contain a "Names" column -- confirm against the dataset layout.
surnames = pd.read_csv("~/personal/correct_dataset/surname_2.csv")["Names"]
names = pd.read_csv("~/personal/correct_dataset/names.csv")["Names"]


def generate_word(name_size: int, letters=None):
    """Return a random word of ``name_size`` characters drawn with
    replacement from *letters* (defaults to the module-level Persian
    ``alphabet``; the optional parameter is a backward-compatible
    generalization)."""
    pool = alphabet if letters is None else letters
    return ''.join(random.choices(pool, k=name_size))


def count_part(name):
    """Count the whitespace-separated components of *name*."""
    pieces = name.split()
    return len(pieces)

# Write a fully random "wrong" name whose parts match the lengths of the
# parts of each real name.
#
# Bug fixed: the original called ``len(name[0])`` for multi-part names;
# ``name[0]`` is the first *character*, so its length is always 1 and
# every generated part was a single letter.  Split into whitespace
# parts and match each part's real length instead.  The redundant
# ``file.close()`` after ``with`` is also dropped.
with open("name_wrong.txt", 'wt') as file:
    for name in list(names):
        parts = name.split()
        # Names with more than 3 parts are skipped, as in the original.
        if 1 <= len(parts) <= 3:
            file.write(' '.join(generate_word(len(part)) for part in parts) + "\n")

# Same length-matched random generation for surnames (see the name loop
# above for the fixed ``len(surname[0])`` character-indexing bug).
with open("surname_wrong.txt", 'wt') as file:
    for surname in list(surnames):
        parts = surname.split()
        # Surnames with more than 3 parts are skipped, as before.
        if 1 <= len(parts) <= 3:
            file.write(' '.join(generate_word(len(part)) for part in parts) + "\n")
Loading

0 comments on commit c764233

Please sign in to comment.