Sentiment Analysis using LSTM - Comparison with RNN
Using Stanford IMDB Movie Reviews Dataset
Back to Home
Step 1: Import Libraries
# Importing Libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re, string, os
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from matplotlib.ticker import FormatStrFormatter
print("All the required libraries are imported.\n")
Step 2: Define Parameters
# Hyperparameters shared by both models, followed by the dataset folder layout.
VOCAB_SIZE = 10000   # passed to the tokenizer (num_words) and the Embedding layers
MAX_LEN = 300        # every tokenized review is padded/truncated to this length
EMBED_DIM = 256      # width of the embedding vectors
RNN_UNITS = 256      # hidden units of the SimpleRNN layer
LSTM_UNITS = 64      # hidden units of the LSTM layer
BATCH_SIZE = 128
EPOCHS = 150

# The four labelled folders all live under the same dataset root.
BASE_PATH = "/kaggle/input/standford-dataset/aclImdb"
TRAIN_POS, TRAIN_NEG, TEST_POS, TEST_NEG = (
    os.path.join(BASE_PATH, subdir)
    for subdir in ("train/pos", "train/neg", "test/pos", "test/neg")
)

# Echo the configuration, one setting per output line.
print(
    "Parameters for training are set as:",
    f" Vocabulary Size: {VOCAB_SIZE}",
    f" Maximum Sequence Length: {MAX_LEN}",
    f" Embedding Dimensions: {EMBED_DIM}",
    f" RNN units: {RNN_UNITS}",
    f" LSTM units: {LSTM_UNITS}",
    f" Batch Size: {BATCH_SIZE}",
    f" Epochs: {EPOCHS}",
    sep="\n",
)
Step 3: Dataset Preprocessing
# Dataset Preprocessing
def clean_text(text):
    """Normalize a raw review into lowercase, punctuation-free words.

    Steps, in order: lowercase, replace HTML line-break tags with a space,
    drop ``[...]`` bracketed spans, strip ASCII punctuation, drop any token
    containing a digit, and collapse runs of whitespace.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string with single spaces between words and no
        leading/trailing whitespace.
    """
    text = text.lower()
    # Reviews contain literal "<br />" tags. They must become a space *before*
    # punctuation is stripped, otherwise "movie!<br />loved" turns into
    # "moviebr loved" and "br" fragments end up in the vocabulary.
    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub(r'\[.*?\]', '', text)
    # Single pass deleting every character in string.punctuation.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# ======================= LOAD REVIEWS =======================
def load_reviews(folder, label, max_samples):
    """Read up to ``max_samples`` review files from ``folder``.

    Directory entries are taken in sorted-name order and each one is read
    as UTF-8 text.

    Args:
        folder: Directory containing one review per file.
        label: Label attached to every review read from this folder.
        max_samples: Upper bound on the number of files to read.

    Returns:
        A ``(texts, labels)`` pair of equal-length lists.
    """
    selected = sorted(os.listdir(folder))[:max_samples]
    texts = []
    for name in selected:
        with open(os.path.join(folder, name), encoding="utf-8") as handle:
            texts.append(handle.read())
    # Every review from this folder shares the same label.
    labels = [label] * len(texts)
    return texts, labels
Step 4: Train-Val-Test Splitting
# Train-Val-Test Splitting
# Load a balanced subset: 4250 + 4250 reviews for train/val, 750 + 750 for test.
train_pos, y_train_pos = load_reviews(TRAIN_POS, 1, 4250)
train_neg, y_train_neg = load_reviews(TRAIN_NEG, 0, 4250)
# dtype=object keeps references to the Python strings. Without it NumPy builds
# a fixed-width unicode array in which every element is as wide as the longest
# review, which wastes a large amount of memory on this corpus.
X_train_full = np.array(train_pos + train_neg, dtype=object)
y_train_full = np.array(y_train_pos + y_train_neg)

test_pos, y_test_pos = load_reviews(TEST_POS, 1, 750)
test_neg, y_test_neg = load_reviews(TEST_NEG, 0, 750)
X_test = np.array(test_pos + test_neg, dtype=object)
y_test = np.array(y_test_pos + y_test_neg)

print("Train+Val:", len(X_train_full))
print("Test :", len(X_test))

# Normalize the raw text of every review.
X_train_full = np.array([clean_text(x) for x in X_train_full], dtype=object)
X_test = np.array([clean_text(x) for x in X_test], dtype=object)

# Hold out 1500 reviews for validation; stratify to keep the classes balanced.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=1500,
    random_state=42,
    stratify=y_train_full,
)

print("Train:", len(X_train))
print("Val :", len(X_val))
print("Test :", len(X_test))
print("Train balance:", np.bincount(y_train))
print("Val balance :", np.bincount(y_val))
print("Test balance :", np.bincount(y_test))

# Tokenization
# The vocabulary is fitted on the training split only, so validation and test
# words unseen in training map to the "<UNK>" token.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)
X_train_pad = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_LEN)
X_val_pad = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=MAX_LEN)
X_test_pad = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_LEN)
Step 5: Model Architectures & Checkpoints
# Defining Model Architectures & Checkpoints
def _build_classifier(recurrent_layer, learning_rate):
    """Return a compiled Embedding -> recurrent -> sigmoid binary classifier."""
    model = Sequential([
        Input(shape=(MAX_LEN,)),
        Embedding(VOCAB_SIZE, EMBED_DIM),
        recurrent_layer,
        Dense(1, activation='sigmoid'),
    ])
    model.compile(
        optimizer=Adam(learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model


def _best_checkpoint(filepath):
    """Return a checkpoint callback that keeps only the best val_accuracy model."""
    return ModelCheckpoint(filepath, monitor="val_accuracy",
                           save_best_only=True, mode='max', verbose=0)


def _lr_scheduler():
    """Return a callback that halves the learning rate when val_loss plateaus."""
    return ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.5,
        patience=6,
        min_lr=1e-6,
        verbose=0,
    )


# Simple RNN Model
model_rnn = _build_classifier(
    SimpleRNN(RNN_UNITS, dropout=0.3, recurrent_dropout=0.1),
    3e-5,
)

# LSTM Model
model_lstm = _build_classifier(
    LSTM(LSTM_UNITS, dropout=0.3),
    1e-5,
)

# Model Checkpoints
rnn_ckpt = _best_checkpoint("best_rnn.keras")
lstm_ckpt = _best_checkpoint("best_lstm.keras")

# Each model gets its own scheduler instance.
reduce_lr_rnn = _lr_scheduler()
reduce_lr_lstm = _lr_scheduler()
Step 6: Model Training
# Model Training
# Both models train on the same data with the same schedule; only the
# callbacks differ, so the shared keyword arguments are gathered once.
fit_options = dict(
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_val_pad, y_val),
    verbose=0,
)

print("\nStarting training...\n")

print("\nTraining RNN...\n")
history_rnn = model_rnn.fit(
    X_train_pad, y_train,
    callbacks=[rnn_ckpt, reduce_lr_rnn],
    **fit_options,
)

print("\nTraining LSTM...\n")
history_lstm = model_lstm.fit(
    X_train_pad, y_train,
    callbacks=[lstm_ckpt, reduce_lr_lstm],
    **fit_options,
)

print("\nTraining completed.")

# Loading Best Models
# Reload the checkpoints written during training rather than using the
# weights left over from the final epoch.
best_rnn = load_model("best_rnn.keras")
best_lstm = load_model("best_lstm.keras")
print("\nBest models loaded for both RNN and LSTM.")
Step 7: Model Evaluation
# Model Evaluation
# Accuracy
# NOTE: "Train Accuracy" is the value logged for the last training epoch,
# whereas the validation and test figures come from the reloaded
# best-checkpoint models, so the three numbers may describe different weights.
print("\nFinal Accuracies-")
print("\n========= SIMPLE RNN =========")
print(f"Train Accuracy : {history_rnn.history['accuracy'][-1]*100:.2f}%")
print(f"Val Accuracy : {best_rnn.evaluate(X_val_pad, y_val, verbose=0)[1]*100:.2f}%")
print(f"Test Accuracy : {best_rnn.evaluate(X_test_pad, y_test, verbose=0)[1]*100:.2f}%")
print("\n========= LSTM =========")
print(f"Train Accuracy : {history_lstm.history['accuracy'][-1]*100:.2f}%")
print(f"Val Accuracy : {best_lstm.evaluate(X_val_pad, y_val, verbose=0)[1]*100:.2f}%")
print(f"Test Accuracy : {best_lstm.evaluate(X_test_pad, y_test, verbose=0)[1]*100:.2f}%")

# Test-set predictions
# The reports below need hard class labels, which were never computed before
# being used. Flatten the sigmoid outputs and threshold them at 0.5.
y_prob_rnn = best_rnn.predict(X_test_pad, verbose=0).ravel()
y_prob_lstm = best_lstm.predict(X_test_pad, verbose=0).ravel()
y_pred_rnn = (y_prob_rnn > 0.5).astype(int)
y_pred_lstm = (y_prob_lstm > 0.5).astype(int)

# Classification Report
print("\nFinal Classification Reports-")
print("\n========= RNN Classification Report =========")
print(classification_report(y_test, y_pred_rnn, digits=4))
print("\n========= LSTM Classification Report =========")
print(classification_report(y_test, y_pred_lstm, digits=4))
Step 8: Graph Visualization