-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
134 lines (110 loc) · 4.72 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import numpy as np
import pandas as pd
from string import punctuation
import re
import nltk
import random
# nltk.download('stopwords')
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# from nltk.tokenize import TweetTokenizer
# from sklearn.preprocessing import LabelEncoder
# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import Input, Embedding, LSTM, Dense,Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
# from sklearn.feature_extraction.text import TfidfVectorizer
# import matplotlib.pyplot as plt
# import seaborn as sns
# from nltk.tokenize import word_tokenize
# nltk.download("punkt")
from tokenizer import tokenize
# Load the training sample at import time; the relative path is resolved
# against the current working directory of the running process.
df = pd.read_csv('train-sample.csv')
def tok(str, maxlen=512):
    """Encode one text string as a padded sequence of word indices.

    A new Keras ``Tokenizer`` is created and fitted on this text alone on
    every call.

    Args:
        str: The text to encode. The name shadows the builtin but is kept
            because it is the function's public parameter name.
        maxlen (int, optional): Length passed to ``pad_sequences``
            (defaults to 512).
    Returns:
        The value returned by ``pad_sequences`` for the one-element batch
        ``[str]``.
    """
    # NOTE(review): because the vocabulary is fitted per call, the indices
    # produced for different inputs are not comparable with each other.
    text = str
    tokenizer = Tokenizer()
    # fit_on_texts expects a collection of texts. A bare string would be
    # iterated character by character, yielding a character vocabulary that
    # the word-level lookup below could never match.
    tokenizer.fit_on_texts([text])
    sequences = tokenizer.texts_to_sequences([text])
    return pad_sequences(sequences, maxlen=maxlen)
def stemming(word):
    """Return ``word`` with each whitespace-separated token Porter-stemmed.

    Args:
        word (str): Text to process; it is split on whitespace.
    Returns:
        str: The stemmed tokens joined by single spaces.
    """
    porter = PorterStemmer()
    return ' '.join(porter.stem(token) for token in word.split())
def create_text_column(df, title_col, body_col, prefix=" "):
    """
    Add a 'text' column to the DataFrame by combining each row's title and body.

    The DataFrame is modified in place and the same object is returned.

    Args:
        df (pandas.DataFrame): The DataFrame to process.
        title_col (str): The name of the column containing the title text.
        body_col (str): The name of the column containing the body markdown text.
        prefix (str, optional): A prefix to add before the body text (defaults to a space).
    Returns:
        pandas.DataFrame: The modified DataFrame with the new 'text' column.
    """
    # Walk the two columns in lockstep instead of using df.apply(axis=1):
    # on an empty frame a row-wise apply returns a DataFrame rather than a
    # Series, which cannot be assigned to a single column. An empty list
    # assigns cleanly, and no per-row Series has to be built.
    df['text'] = [
        create_text_row(title, body, prefix=prefix)
        for title, body in zip(df[title_col], df[body_col])
    ]
    return df
def create_text_row(title, body, row=None, prefix=" "):
    """
    Build the combined text string for a single record.

    Args:
        title: The title value; it is interpolated into the result.
        body: The body value; it is interpolated into the result.
        row (optional): Accepted for call compatibility; not read by this function.
        prefix (str, optional): Text placed at the start of the body line (defaults to a space).
    Returns:
        str: The title line followed by a newline and the prefixed body line.
    """
    pieces = (
        f"Title: '{title}'",
        f"\n{prefix}Body: '{body}'",
    )
    return "".join(pieces)
# Build the combined 'text' column from the question title and body.
# prefix=" " equals the helper's default, so one pass over the rows is enough.
df = create_text_column(df, "Title", "BodyMarkdown", prefix=" ")
def pro(df):
    """
    Clean the 'text' column in place: drop stopwords, strip punctuation, stem.

    For each value of ``df['text']`` the steps are, in order:
    1. split on whitespace and drop tokens that exactly (case-sensitively)
       match an NLTK English stopword;
    2. delete every character that is neither a word character nor whitespace;
    3. Porter-stem each remaining whitespace-separated token.

    Args:
        df (pandas.DataFrame): Frame with a 'text' column of strings.
    Returns:
        pandas.DataFrame: The same ``df`` object, returned so the call can be
        chained or reassigned like ``create_text_column``.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would otherwise be scanned once for every word of every row.
    stopwords_english = set(stopwords.words('english'))
    non_word_chars = re.compile(r'[^\w\s]')
    stemmer = PorterStemmer()

    def _clean(text):
        # Stopwords are filtered before punctuation is stripped, so tokens
        # are compared in their original, unstripped form.
        kept = ' '.join(word for word in text.split() if word not in stopwords_english)
        stripped = non_word_chars.sub('', kept)
        return ' '.join(stemmer.stem(token) for token in stripped.split())

    df['text'] = df['text'].apply(_clean)
    return df
def encode_labels(df, source_column, custom_labelling, default_value=None):
    """
    Encode categorical labels in a DataFrame column using a custom labeling scheme.

    Args:
        df (pandas.DataFrame): The DataFrame containing the labels to encode.
        source_column (str): The name of the column containing the categorical labels.
        custom_labelling (dict): A dictionary mapping labels to numerical codes.
        default_value (int, optional): The value used for labels that are not
            keys of ``custom_labelling`` (defaults to None).
    Returns:
        list: Encoded labels, one per entry of ``df[source_column]``, in order.
    """
    encoded_labels = []
    for label in df[source_column]:
        if label in custom_labelling:
            encoded_labels.append(custom_labelling[label])
            continue
        # Warn on every unknown label, whatever default_value is. Testing the
        # looked-up value against None would stay silent whenever a non-None
        # default is supplied.
        print(f"Warning: Label '{label}' not found in custom_labelling. Using default value: {default_value}")
        encoded_labels.append(default_value)
    return encoded_labels
# Example usage: each known OpenStatus value gets its position in this
# tuple as its numeric code (0 through 4).
custom_labelling = {
    status: code
    for code, status in enumerate((
        'open',
        'not a real question',
        'not constructive',
        'too localized',
        'off topic',
    ))
}
encoded_labels = encode_labels(
    df,
    source_column="OpenStatus",
    custom_labelling=custom_labelling,
)