populate_exercise_table.py
#!/usr/bin/env python
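"""Populate the "exercise" table in the SQLite task pool database.

Joins the exercise TSV export with the similar-words TSV export on "word"
and writes the result to the "exercise" table in ../taskpool.db, replacing
any existing table.

Usage (both arguments are optional and fall back to the data-import/ defaults below):

    python populate_exercise_table.py [EXERCISE_TSV] [SIMILAR_WORDS_TSV]
"""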
import os
import sys
import hashlib
import pandas as pd
import numpy as np
import sqlite3
import json

# SQLite database that holds the task pool (path relative to the working directory).
DB_FILE = "../taskpool.db"


def load_similar_words_data(similar_words_path: str) -> pd.DataFrame:
    """Read the similar-words TSV and collapse each row into a JSON-encoded list."""
    df = pd.read_csv(similar_words_path, sep='\t')

    def remove_nans(similar_words):
        # Wide rows are padded with NaN; keep only the actual string entries.
        return list(filter(lambda x: isinstance(x, str), similar_words))

    result = []
    for i in range(len(df.index)):
        row = df.iloc[i]
        filtered = remove_nans(row.to_list()[1:])
        result.append([row["word"], json.dumps(filtered)])
    return pd.DataFrame(result, columns=["word", "similar_words"])


def load_exercise_data(exercise_path: str) -> pd.DataFrame:
    """Read the exercise TSV and add a deterministic id per sentence pair."""
    df = pd.read_csv(exercise_path,
                     usecols=[
                         "word",
                         "translation_id",
                         "target_sentence_id",
                         "source_sentence_id"
                     ],
                     dtype={
                         "word": 'string',
                         "translation_id": np.int64,
                         "target_sentence_id": np.int64,
                         "source_sentence_id": np.int64
                     }, sep='\t')

    def generate_id(row):
        # MD5 of "<source_sentence_id>_<target_sentence_id>" gives a stable id.
        return hashlib.md5("{}_{}".format(row["source_sentence_id"], row["target_sentence_id"]).encode("utf-8")) \
            .hexdigest()

    # Add the id column.
    df['id'] = pd.Series([generate_id(df.iloc[i]) for i in range(len(df.index))], dtype='string')
    return df


def find_or_exit(path):
    """Exit with status 1 if the given path does not exist."""
    if not os.path.exists(path):
        print("cannot find", path)
        sys.exit(1)


def main():
    print("Preparing data...")

    # Default input paths; both can be overridden via positional arguments.
    exercise_path = "data-import/exercise-import.tsv"
    similar_words_path = "data-import/similar-words-import.tsv"
    if len(sys.argv) >= 2:
        exercise_path = sys.argv[1]
    if len(sys.argv) >= 3:
        similar_words_path = sys.argv[2]
    find_or_exit(exercise_path)
    find_or_exit(similar_words_path)

    print("Reading exercise data from", exercise_path)
    print("Reading similar words data from", similar_words_path)
    df1 = load_similar_words_data(similar_words_path)
    df2 = load_exercise_data(exercise_path)

    print("Merging tables together")
    # Join the two frames on "word", then drop any rows with a missing word.
    df = pd.merge(df1, df2, on="word").dropna(subset=["word"])
    df = df.rename(columns={
        "word": "target_word"
    })

    with sqlite3.connect(DB_FILE) as conn:
        print("Uploading {0} values to exercise table...".format(df.shape[0]))
        df.to_sql("exercise", conn, if_exists="replace", index=False)
    print('Done')


if __name__ == "__main__":
    main()
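
# Quick verification sketch (an assumed follow-up step, not part of the import itself):
#
#   import sqlite3
#   with sqlite3.connect("../taskpool.db") as conn:
#       print(conn.execute("SELECT COUNT(*) FROM exercise").fetchone())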