-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
af4bdaf
commit c764233
Showing
33 changed files
with
117,014 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
venv/ | ||
*.txt | ||
*.csv | ||
bad_spelling_data/*.txt | ||
bad_spelling_data/*.csv | ||
generated_data/ | ||
todo.txt | ||
"csv files" | ||
saved_gpt/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# data_manufacture | ||
|
||
## how to load tokenizer: | ||
|
||
from transformers import PreTrainedTokenizerFast | ||
wrapped_tokenizer = PreTrainedTokenizerFast( | ||
tokenizer_file="name of the .json file of the tokenizer", | ||
bos_token="<|endoftext|>", | ||
eos_token="<|endoftext|>", | ||
pad_token = "<|endoftext|>" | ||
|
||
) | ||
tokenizer=wrapped_tokenizer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
from flask import Flask,request,jsonify | ||
from transformers import GPT2LMHeadModel,PreTrainedTokenizerFast | ||
# Fast tokenizer built from the serialized tokenizer file; the same
# end-of-text marker is used as both the BOS and the EOS token.
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer_BPE3.json",
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
)
tokenizer = wrapped_tokenizer
# No dedicated padding token is configured, so padding reuses the EOS id.
tokenizer.pad_token_id = tokenizer.eos_token_id

# Load the model checkpoint and switch it to evaluation mode.
model = GPT2LMHeadModel.from_pretrained(
    'testsample30',
    pad_token_id=tokenizer.eos_token_id,
)
model.eval()

app = Flask(__name__)
|
||
|
||
|
||
@app.route("/wordcorrector", methods=["POST"])
def word_correct():
    """Generate a correction for the text posted in the JSON field ``data``.

    The request body must be a JSON object holding a non-empty string under
    ``"data"``. The text is tokenized, the model samples a continuation, and
    the first line of the text following the prompt is returned.

    Returns:
        ``({"generated text": <str>}, 201)`` on success, or
        ``({"error": <str>}, 400)`` when the body is missing, is not valid
        JSON, or carries no usable ``"data"`` string.
    """
    # silent=True yields None for a missing/malformed body so that it can be
    # answered with an explicit 400 instead of an unhandled exception.
    jdata = request.get_json(silent=True)
    print(jdata)
    data = jdata.get("data") if isinstance(jdata, dict) else None
    if not isinstance(data, str) or not data:
        message = 'expected a JSON object with a non-empty string field "data"'
        return jsonify({"error": message}), 400
    print(data)

    input_ids = tokenizer(data, return_tensors='pt').input_ids
    gen_tokens = model.generate(
        input_ids,
        do_sample=True,
        temperature=0.9,
        # NOTE(review): max_length presumably includes the prompt tokens, so
        # long inputs leave little or no room for generation - confirm.
        max_length=15,
    )
    gen_text = tokenizer.batch_decode(gen_tokens)[0]
    # Assumes the decoded text starts with the prompt followed by exactly one
    # separator character - TODO confirm against the tokenizer's decoding.
    correctname = gen_text[len(data) + 1:]
    e = correctname.split('\n')[0]
    print(gen_text)
    print(correctname)
    return jsonify({"generated text": e}), 201
|
||
|
||
if __name__ == "__main__":
    import os

    # The interactive debugger lets anyone who can reach the server execute
    # arbitrary Python, so it is opt-in (FLASK_DEBUG=1) rather than always on.
    app.run(debug=os.environ.get("FLASK_DEBUG") == "1")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
import pandas as pd | ||
import random | ||
import csv | ||
import word_shuffler | ||
|
||
# The 32 letters of the Persian alphabet, in alphabetical order. The letter
# 'و' was previously missing, so it could never appear in generated text.
alphabet = [
    'ا', 'ب', 'پ', 'ت', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'ژ',
    'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م',
    'ن', 'و', 'ه', 'ی',
]
# Load the "Names" column of each CSV; selecting a single column from the
# DataFrame returned by read_csv yields a pandas Series.
surnames = pd.read_csv("~/personal/correct_dataset/surname_2.csv")["Names"]
names = pd.read_csv("~/personal/correct_dataset/names.csv")["Names"]
|
||
|
||
def generate_word(name_size: int):
    """Return a string of ``name_size`` letters drawn at random from ``alphabet``.

    Letters are sampled with replacement, so repeats are possible.
    """
    return ''.join(random.choices(alphabet, k=name_size))
|
||
|
||
def count_part(name):
    """Return how many whitespace-separated parts ``name`` consists of."""
    return len(name.split())
|
||
def less_letter(name):
    """Return ``name`` with one randomly chosen character removed.

    Only the character at the chosen position is dropped; other occurrences
    of the same letter are kept. An empty ``name`` is returned unchanged.
    """
    if not name:
        return name
    index = random.randint(0, len(name) - 1)
    # Strings are immutable, so the shortened copy has to be built and
    # returned explicitly; slicing removes exactly one position.
    return name[:index] + name[index + 1:]
|
||
def more_letter(name):
    """Return ``name`` with one random letter from ``alphabet`` inserted.

    The insertion point is chosen uniformly from every possible position,
    including before the first and after the last character, so an empty
    ``name`` yields a single random letter.
    """
    random_letter = random.choice(alphabet)
    # len(name) itself is a valid position: it appends the letter at the end.
    random_index = random.randint(0, len(name))
    return name[:random_index] + random_letter + name[random_index:]
|
||
# generates wrong names
# Every whitespace-separated part of a name gets one extra letter; the parts
# are written back separated by a single space. Entries with no parts are
# skipped. utf-8 is set explicitly because the output is Persian text.
with open("more_name.txt", 'wt', encoding="utf-8") as file:
    for name in names:
        parts = name.split()
        if parts:
            file.write(" ".join(more_letter(part) for part in parts) + "\n")

# generates wrong surnames
with open("more_surname.txt", 'wt', encoding="utf-8") as file:
    for surname in surnames:
        parts = surname.split()
        if parts:
            file.write(" ".join(more_letter(part) for part in parts) + "\n")
|
||
|
||
# with open("less_name.txt", 'wt') as file: | ||
# for name in list(names): | ||
# if count_part(name) == 1: | ||
# less_word = less_letter(name) | ||
# file.write(less_word + "\n") | ||
|
||
|
||
# elif count_part(name) == 2: | ||
# less_word = less_letter(name[0]) + less_letter(name[1]) | ||
# file.write(less_word + "\n") | ||
|
||
# elif count_part(name) == 3: | ||
# less_word = less_letter(name[0]) + less_letter(name[1]) + less_letter(name[2]) | ||
# file.write(less_word + "\n") | ||
|
||
# file.close() | ||
|
||
# # generates wrong surnames | ||
# with open("less_surname.txt", 'wt') as file: | ||
# for surname in list(surnames): | ||
# if count_part(surname) == 1: | ||
# less_word = less_letter(surname) | ||
# file.write(less_word + "\n") | ||
|
||
|
||
# elif count_part(surname) == 2: | ||
# less_word = less_letter(surname[0]) + less_letter(surname[1]) | ||
# file.write(less_word + "\n") | ||
|
||
# elif count_part(surname) == 3: | ||
# less_word = less_letter(surname[0]) + less_letter(surname[1]) + less_letter(surname[2]) | ||
# file.write(less_word + "\n") | ||
|
||
# file.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
import pandas as pd | ||
import random | ||
import csv | ||
import word_shuffler | ||
|
||
# The 32 letters of the Persian alphabet, in alphabetical order. The letter
# 'و' was previously missing, so it could never be picked as a replacement.
alphabet = [
    'ا', 'ب', 'پ', 'ت', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'ژ',
    'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م',
    'ن', 'و', 'ه', 'ی',
]
# Load the "Names" column of each CSV; selecting a single column from the
# DataFrame returned by read_csv yields a pandas Series.
surnames = pd.read_csv("~/personal/correct_dataset/surname_2.csv")["Names"]
names = pd.read_csv("~/personal/correct_dataset/names.csv")["Names"]
|
||
|
||
def generate_word(name_size: int):
    """Build a random word of ``name_size`` letters taken from ``alphabet``.

    Sampling is done with replacement, so a letter may occur more than once.
    """
    letters = random.choices(alphabet, k=name_size)
    word = ''.join(letters)
    return word
|
||
|
||
def count_part(name):
    """Count the whitespace-separated parts of ``name``."""
    parts = name.split()
    return len(parts)
|
||
def replace_letter(name):
    """Return ``name`` with one randomly chosen character replaced.

    A position in ``name`` and a letter from ``alphabet`` are picked at
    random; only the character at that position is substituted.

    Returns:
        The modified string, or ``False`` when no change was produced
        (``name`` is empty, or the drawn letter equals the one already at
        the chosen position).
    """
    if not name:
        return False
    index = random.randint(0, len(name) - 1)
    random_letter = random.choice(alphabet)
    # Compare by value: identity (`is`) is not a reliable test for strings.
    if name[index] == random_letter:
        return False
    # Strings are immutable; build the new string by slicing so that exactly
    # one position changes.
    return name[:index] + random_letter + name[index + 1:]
|
||
# generates wrong names
# Every whitespace-separated part of a name gets one letter replaced; the
# parts are written back separated by a single space. replace_letter returns
# False when it produced no change, and such names are skipped entirely.
# utf-8 is set explicitly because the output is Persian text.
with open("replace_letter_name.txt", 'wt', encoding="utf-8") as file:
    for name in names:
        new_parts = [replace_letter(part) for part in name.split()]
        if new_parts and all(new_parts):
            file.write(" ".join(new_parts) + "\n")

# generates wrong surnames
with open("replace_letter_surname.txt", 'wt', encoding="utf-8") as file:
    for surname in surnames:
        new_parts = [replace_letter(part) for part in surname.split()]
        if new_parts and all(new_parts):
            file.write(" ".join(new_parts) + "\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
import pandas as pd | ||
|
||
surname_data = pd.read_csv("~/personal/correct_dataset/surname_2.csv")["Names"]
name_data = pd.read_csv("~/personal/correct_dataset/names.csv")["Names"]
# drop_duplicates() returns a new Series instead of modifying in place, so
# the result has to be assigned back for the de-duplication to take effect.
surname_data = surname_data.drop_duplicates()
name_data = name_data.drop_duplicates()

# sur_select = surname_data.loc[surname_data['Names']]
# name_select = name_data.loc[name_data['Names'] > 10]
print(surname_data)

# Write every "surname name" combination (surname first), one per line.
# utf-8 is set explicitly because the output is Persian text.
with open("reverced_name.txt", 'wt', encoding="utf-8") as f:
    for i in surname_data:
        for j in name_data:
            line = i + " " + j
            print(line)
            f.write(line + "\n")
# print(sur_select[1]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
import pandas as pd | ||
from random import shuffle | ||
def shuffle_word(word):
    """Return the characters of ``word`` in a random order."""
    letters = [*word]
    shuffle(letters)
    return ''.join(letters)
|
||
|
||
surname_data = pd.read_csv("~/personal/correct_dataset/surname_2.csv")["Names"]
name_data = pd.read_csv("~/personal/correct_dataset/names.csv")["Names"]
# drop_duplicates() returns a new Series instead of modifying in place, so
# the result has to be assigned back for the de-duplication to take effect.
surname_data = surname_data.drop_duplicates()
name_data = name_data.drop_duplicates()

# NOTE(review): this runs whenever the module is imported; if other scripts
# import this file only for shuffle_word, consider a __main__ guard.
# utf-8 is set explicitly because the output is Persian text.
with open("word_shuffled.txt", 'wt', encoding="utf-8") as f:
    for i in surname_data:
        for j in name_data:
            # Shuffle once per pair so the printed line is the written line.
            line = shuffle_word(i) + " " + shuffle_word(j)
            print(line)
            f.write(line + "\n")
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import pandas as pd | ||
import random | ||
zs = "ضظز"
s_letters = "سثص"

# (letter to look for, letters it may be swapped with), in priority order.
_CONFUSABLE_LETTERS = (
    ('ز', zs),
    ('ظ', zs),
    ('ض', zs),
    ('س', s_letters),
    ('ث', s_letters),
    ('ص', s_letters),
    ('ط', 'ت'),
    ('ت', 'ط'),
)


def change_letters(word):
    """Return ``word`` with one letter swapped for a letter of the same group.

    The groups are the letters of ``zs``, the letters of ``s_letters``, and
    the pair 'ط'/'ت'. Letters are looked for in a fixed priority order; only
    the first one found in ``word`` is handled, and every occurrence of it is
    replaced by one randomly chosen *different* letter of its group. A word
    containing none of these letters (including the empty string) is
    returned unchanged.
    """
    for letter, group in _CONFUSABLE_LETTERS:
        if letter in word:
            # Exclude the letter itself so the result is always a misspelling.
            alternatives = [other for other in group if other != letter]
            return word.replace(letter, random.choice(alternatives))
    return word
|
||
|
||
surname_data = pd.read_csv("~/personal/correct_dataset/surname_2.csv")["Names"]
name_data = pd.read_csv("~/personal/correct_dataset/names.csv")["Names"]
# drop_duplicates() returns a new Series instead of modifying in place, so
# the result has to be assigned back for the de-duplication to take effect.
surname_data = surname_data.drop_duplicates()
name_data = name_data.drop_duplicates()

# utf-8 is set explicitly because the output is Persian text.
with open("final_wrong_letter.txt", 'wt', encoding="utf-8") as f:
    for i in surname_data:
        for j in name_data:
            # change_letters is random: call it once per pair so the printed
            # line is exactly the line that gets written.
            line = change_letters(i) + " " + change_letters(j)
            print(line)
            f.write(line + "\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
import pandas as pd | ||
import random | ||
import csv | ||
import word_shuffler | ||
|
||
# The 32 letters of the Persian alphabet, in alphabetical order. The letter
# 'و' was previously missing, so it could never appear in generated words.
alphabet = [
    'ا', 'ب', 'پ', 'ت', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'ژ',
    'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م',
    'ن', 'و', 'ه', 'ی',
]
# Load the "Names" column of each CSV; selecting a single column from the
# DataFrame returned by read_csv yields a pandas Series.
surnames = pd.read_csv("~/personal/correct_dataset/surname_2.csv")["Names"]
names = pd.read_csv("~/personal/correct_dataset/names.csv")["Names"]
|
||
|
||
def generate_word(name_size: int):
    """Return ``name_size`` random letters from ``alphabet`` joined together.

    Letters are drawn with replacement.
    """
    picked = random.choices(alphabet, k=name_size)
    return ''.join(picked)
|
||
|
||
def count_part(name):
    """Return the number of whitespace-separated parts in ``name``."""
    return len(name.split())
|
||
# generates wrong names
# Every whitespace-separated part of a name is replaced by a random word of
# the same length; the parts are written back separated by a single space.
# Entries with no parts are skipped. utf-8 is set explicitly because the
# output is Persian text.
with open("name_wrong.txt", 'wt', encoding="utf-8") as file:
    for name in names:
        parts = name.split()
        if parts:
            file.write(" ".join(generate_word(len(part)) for part in parts) + "\n")

# generates wrong surnames
with open("surname_wrong.txt", 'wt', encoding="utf-8") as file:
    for surname in surnames:
        parts = surname.split()
        if parts:
            file.write(" ".join(generate_word(len(part)) for part in parts) + "\n")
Oops, something went wrong.