# proyek_nlp_millata_tasyakhanifa.py

# -*- coding: utf-8 -*-
"""Proyek_NLP_Millata Tasyakhanifa.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1ihBvaNAFB0tzPQFt5eLwVx7uY36D-QID

# Determine Whether Products Are Recommended Based on Reviewers' Review Texts

### Nama: Millata Tasyakhanifa
### Username: millatasyaa
### Email: millatatasyakhanifa@gmail.com
"""

import subprocess

# Report the GPU visible to this runtime. The notebook-only `!` shell escape
# is not valid Python, so the tool is invoked through subprocess instead and
# its output is printed explicitly. A missing binary is reported, not raised.
try:
    _gpu_query = subprocess.run(
        ["nvidia-smi"], capture_output=True, text=True, check=False
    )
    gpu_status = _gpu_query.stdout or _gpu_query.stderr
except OSError:
    gpu_status = "nvidia-smi is not available on this machine."
print(gpu_status)

"""## Import Library"""

import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

"""## Read Dataset"""

# Load the reviews CSV from the runtime's /content directory. A comma is
# already the default pandas delimiter, so it is not spelled out here.
dataset_path = "/content/Womens Clothing E-Commerce Reviews.csv"
df = pd.read_csv(dataset_path)

# Notebook-style inspection: preview the frame, then summarise its columns.
df

df.info()

"""## Drop unnecessary features"""

# Columns that are removed before any further processing.
unused_columns = ['Title', 'Clothing ID', 'Positive Feedback Count']
df = df.drop(columns=unused_columns)
df.head()

"""## Drop Missing Values"""

# Per-column count of missing entries before the rows are filtered.
df.isnull().sum()

# Discard every row that lacks a review text or any of the category fields.
required_columns = ['Review Text', 'Division Name', 'Department Name', 'Class Name']
df = df.dropna(axis=0, subset=required_columns)

# Total number of missing cells remaining in the frame.
df.isnull().sum().sum()

"""## Do One Hot Encoding on Recommended IND column"""

# One-hot encode the recommendation flag. pandas 2.x returns boolean dummy
# columns by default, so a numeric dtype is requested explicitly: these
# columns are used further down as the targets of a softmax model trained
# with categorical cross-entropy.
rec_ind = pd.get_dummies(df['Recommended IND'], dtype='float32')

# Attach the encoded columns to the frame and drop the original flag column.
new_df = pd.concat([df, rec_ind], axis=1)
new_df = new_df.drop(columns='Recommended IND')
new_df

"""## Tokenization"""

# Word-level tokenizer: vocabulary limit of 5000, lowercased input, the listed
# punctuation characters stripped, and 'x' used as the out-of-vocabulary token.
tokenizer = Tokenizer(
    num_words=5000,
    oov_token='x',
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
)

# Build the word index from the review texts.
review_texts = df['Review Text'].values
tokenizer.fit_on_texts(review_texts)

# Convert each review into a list of integer word indices.
encoded_reviews = tokenizer.texts_to_sequences(review_texts)

vocabulary = tokenizer.word_index
print(len(vocabulary))

print(vocabulary)

# Pad the sequences with pad_sequences' default settings so they form a
# single 2-D array.
X = pad_sequences(encoded_reviews)
print(X)

X.shape

"""# Split Dataset"""

# The one-hot encoded recommendation columns are the training targets.
y = rec_ind

y.shape

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Print the feature/target shapes of the training part, then the test part.
for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, targets.shape)

"""# Make Model"""

# Sequence classifier: embedding -> LSTM -> two dense layers -> 2-way softmax.
# The embedding table is sized from the tokenizer's vocabulary limit, because
# the tokenizer never emits an index at or above `num_words`; a larger table
# would only add rows that can never be looked up or trained.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(tokenizer.num_words, 32, input_length=X.shape[1]),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.LSTM(100, dropout=0.4),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    # Two output units match the two one-hot target columns.
    tf.keras.layers.Dense(2, activation='softmax')
])
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

class myCallback(tf.keras.callbacks.Callback):
    """Stop training as soon as the training accuracy exceeds 90%."""

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's accuracy and request a stop above the threshold.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric values for the finished epoch; treated as empty
                when omitted.
        """
        # A missing 'accuracy' entry must not crash training, so it is
        # checked for None before the numeric comparison.
        accuracy = (logs or {}).get('accuracy')
        if accuracy is not None and accuracy > 0.9:
            print("\nAkurasi telah mencapai >90%!")
            self.model.stop_training = True
# Callback instance that ends training once the accuracy threshold is passed.
callbacks = myCallback()

# Training configuration, collected in one place.
fit_options = {
    'epochs': 100,
    'batch_size': 64,
    'validation_split': 0.2,  # Validation set = 20%
    'callbacks': [callbacks],
}

# Train the model and keep the per-epoch metric history for plotting.
history = model.fit(X_train, y_train, **fit_options)

"""## Loss and Accuracy Plots During Training and Validation"""

# One figure per tracked metric: (history key, display label, figure size).
plot_specs = (
    ('accuracy', 'Accuracy', (18, 6)),
    ('loss', 'Loss', (10, 6)),
)

for metric_key, metric_label, figure_size in plot_specs:
    plt.figure(figsize=figure_size)
    # Training curve first, then the validation curve for the same metric.
    plt.plot(history.history[metric_key], label=f'Train {metric_label}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_label}')
    plt.title(f'Train and Validation {metric_label} Graphs')
    plt.xlabel('Epochs')
    plt.ylabel(metric_label)
    plt.legend()