Sentiment Analysis using LSTM - Comparison with RNN

Step 1: Import Libraries

# Importing Libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re, string, os

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from matplotlib.ticker import FormatStrFormatter

print("All the required libraries are imported.\n")

Step 2: Define Parameters

# Defining paths & Parameters

# --- Text encoding ---
VOCAB_SIZE = 10000   # tokenizer vocabulary cap and embedding input size
MAX_LEN = 300        # every review is padded/truncated to this many tokens
EMBED_DIM = 256      # width of the learned word embeddings

# --- Recurrent layer sizes ---
RNN_UNITS = 256
LSTM_UNITS = 64

# --- Optimisation schedule ---
BATCH_SIZE = 128
EPOCHS = 150

# --- Dataset locations (one folder per split and sentiment class) ---
BASE_PATH = "/kaggle/input/standford-dataset/aclImdb"
TRAIN_POS = os.path.join(BASE_PATH, "train/pos")
TRAIN_NEG = os.path.join(BASE_PATH, "train/neg")
TEST_POS  = os.path.join(BASE_PATH, "test/pos")
TEST_NEG  = os.path.join(BASE_PATH, "test/neg")

# Echo the configuration so that a run's log records what it was trained with.
print("Parameters for training are set as:")
for _param_label, _param_value in (
    ("Vocabulary Size", VOCAB_SIZE),
    ("Maximum Sequence Length", MAX_LEN),
    ("Embedding Dimensions", EMBED_DIM),
    ("RNN units", RNN_UNITS),
    ("LSTM units", LSTM_UNITS),
    ("Batch Size", BATCH_SIZE),
    ("Epochs", EPOCHS),
):
    print(f"  {_param_label}: {_param_value}")

Step 3: Dataset Preprocessing

# Dataset Preprocessing
def clean_text(text):
    """Normalise a raw review into lowercase, space-separated word tokens.

    Steps, in order:
      1. lowercase the text;
      2. replace HTML line-break tags (``<br>``, ``<br/>``, ``<br />``) with
         a space;
      3. drop anything enclosed in square brackets;
      4. strip all ASCII punctuation;
      5. drop every word that contains a digit;
      6. collapse whitespace runs to a single space and trim the ends.

    Args:
        text: The raw review text.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    text = text.lower()
    # Must happen before punctuation stripping: otherwise only "<", "/" and
    # ">" are removed and a stray "br" is glued onto the neighbouring words
    # (e.g. "movie!<br />loved" -> "moviebr loved").
    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# ======================= LOAD REVIEWS =======================
def load_reviews(folder, label, max_samples):
    """Read up to ``max_samples`` review files from ``folder``.

    Directory entries are taken in sorted name order, so the same subset is
    selected on every run.

    Args:
        folder: Directory holding one review per file.
        label: Class label assigned to every review read from ``folder``.
        max_samples: Upper bound on the number of files to read.

    Returns:
        A ``(texts, labels)`` pair of equal-length lists: the file contents
        and one copy of ``label`` per file.
    """
    selected = sorted(os.listdir(folder))[:max_samples]

    texts = []
    for name in selected:
        review_path = os.path.join(folder, name)
        with open(review_path, encoding="utf-8") as handle:
            texts.append(handle.read())

    labels = [label] * len(texts)
    return texts, labels

Step 4: Train-Val-Test Splitting & Tokenization

# Train-Val-Test Splitting

# Load a class-balanced subset: up to 4250 reviews per class for train+val
# and up to 750 per class for the held-out test set.
train_pos, y_train_pos = load_reviews(TRAIN_POS, 1, 4250)
train_neg, y_train_neg = load_reviews(TRAIN_NEG, 0, 4250)
test_pos, y_test_pos = load_reviews(TEST_POS, 1, 750)
test_neg, y_test_neg = load_reviews(TEST_NEG, 0, 750)

raw_train_full = train_pos + train_neg
raw_test = test_pos + test_neg

y_train_full = np.array(y_train_pos + y_train_neg)
y_test = np.array(y_test_pos + y_test_neg)

print("Train+Val:", len(raw_train_full))
print("Test     :", len(raw_test))

# Clean the plain Python strings and build each array exactly once. A NumPy
# unicode array is fixed-width (every element is padded to the longest
# review), so materialising the raw text as an array before cleaning only
# costs memory.
X_train_full = np.array([clean_text(review) for review in raw_train_full])
X_test       = np.array([clean_text(review) for review in raw_test])

# Carve 1500 validation samples out of the training pool; stratifying keeps
# the positive/negative ratio the same in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=1500,
    random_state=42,
    stratify=y_train_full
)

print("Train:", len(X_train))
print("Val  :", len(X_val))
print("Test :", len(X_test))

print("Train balance:", np.bincount(y_train))
print("Val balance  :", np.bincount(y_val))
print("Test balance :", np.bincount(y_test))

# Tokenization
# The vocabulary is fitted on the training split only, so validation and
# test text never influence the word index; unseen words map to "<UNK>".
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

# Encode each split as integer sequences of exactly MAX_LEN tokens.
X_train_pad = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_LEN)
X_val_pad   = pad_sequences(tokenizer.texts_to_sequences(X_val),   maxlen=MAX_LEN)
X_test_pad  = pad_sequences(tokenizer.texts_to_sequences(X_test),  maxlen=MAX_LEN)

Step 5: Model Architectures & Checkpoints

# Defining Model Architectures & Checkpoints

# Simple RNN Model: embedding -> SimpleRNN -> single sigmoid unit
model_rnn = Sequential()
model_rnn.add(Input(shape=(MAX_LEN,)))
model_rnn.add(Embedding(VOCAB_SIZE, EMBED_DIM))
model_rnn.add(SimpleRNN(RNN_UNITS, dropout=0.3, recurrent_dropout=0.1))
model_rnn.add(Dense(1, activation='sigmoid'))
model_rnn.compile(
    loss='binary_crossentropy',
    optimizer=Adam(3e-5),
    metrics=['accuracy']
)

# LSTM Model: same layout with an LSTM as the recurrent layer
model_lstm = Sequential()
model_lstm.add(Input(shape=(MAX_LEN,)))
model_lstm.add(Embedding(VOCAB_SIZE, EMBED_DIM))
model_lstm.add(LSTM(LSTM_UNITS, dropout=0.3))
model_lstm.add(Dense(1, activation='sigmoid'))
model_lstm.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-5),
    metrics=['accuracy']
)

# Model Checkpoints: keep only the weights with the highest validation
# accuracy seen so far, one file per model.
_checkpoint_options = dict(
    monitor="val_accuracy",
    save_best_only=True,
    mode='max',
    verbose=0
)
rnn_ckpt  = ModelCheckpoint("best_rnn.keras", **_checkpoint_options)
lstm_ckpt = ModelCheckpoint("best_lstm.keras", **_checkpoint_options)

# Learning-rate schedule: halve the rate after 6 epochs without a
# validation-loss improvement, never going below 1e-6. Each model gets its
# own callback instance so their plateau counters stay independent.
_plateau_options = dict(
    monitor="val_loss",
    factor=0.5,
    patience=6,
    min_lr=1e-6,
    verbose=0
)
reduce_lr_rnn = ReduceLROnPlateau(**_plateau_options)
reduce_lr_lstm = ReduceLROnPlateau(**_plateau_options)

Step 6: Model Training

# Model Training

# Both models are trained on the same data with the same schedule; only the
# callbacks (checkpoint file and LR scheduler instance) differ.
_shared_fit_options = dict(
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_val_pad, y_val),
    verbose=0
)

print("\nStarting training...\n")

print("\nTraining RNN...\n")
history_rnn = model_rnn.fit(
    X_train_pad, y_train,
    callbacks=[rnn_ckpt, reduce_lr_rnn],
    **_shared_fit_options
)

print("\nTraining LSTM...\n")
history_lstm = model_lstm.fit(
    X_train_pad, y_train,
    callbacks=[lstm_ckpt, reduce_lr_lstm],
    **_shared_fit_options
)
print("\nTraining completed.")

# Loading Best Models
# Reload the checkpoints written during training rather than using the
# last-epoch weights still held by model_rnn / model_lstm.
best_rnn, best_lstm = (
    load_model(checkpoint_file)
    for checkpoint_file in ("best_rnn.keras", "best_lstm.keras")
)
print("\nBest models loaded for both RNN and LSTM.")

Step 7: Model Evaluation

# Model Evaluation

# Accuracy
# NOTE: the train figure is the last-epoch value recorded in the fit
# history, whereas the val/test figures are measured on the best checkpoint
# reloaded from disk, so the two need not come from the same weights.
print("\nFinal Accuracies-")
print("\n========= SIMPLE RNN =========")
print(f"Train Accuracy : {history_rnn.history['accuracy'][-1]*100:.2f}%")
print(f"Val Accuracy   : {best_rnn.evaluate(X_val_pad, y_val, verbose=0)[1]*100:.2f}%")
print(f"Test Accuracy  : {best_rnn.evaluate(X_test_pad, y_test, verbose=0)[1]*100:.2f}%")

print("\n========= LSTM =========")
print(f"Train Accuracy : {history_lstm.history['accuracy'][-1]*100:.2f}%")
print(f"Val Accuracy   : {best_lstm.evaluate(X_val_pad, y_val, verbose=0)[1]*100:.2f}%")
print(f"Test Accuracy  : {best_lstm.evaluate(X_test_pad, y_test, verbose=0)[1]*100:.2f}%")

# Hard class predictions for the reports below. Each model ends in a single
# sigmoid unit, so predict() yields one probability per review; threshold it
# at 0.5 and flatten to a 1-D label vector aligned with y_test.
y_pred_rnn = (best_rnn.predict(X_test_pad, verbose=0) > 0.5).astype(int).ravel()
y_pred_lstm = (best_lstm.predict(X_test_pad, verbose=0) > 0.5).astype(int).ravel()

# Classification Report
print("\nFinal Classification Reports-")
print("\n========= RNN Classification Report =========")
print(classification_report(y_test, y_pred_rnn, digits=4))

print("\n========= LSTM Classification Report =========")
print(classification_report(y_test, y_pred_lstm, digits=4))

Step 8: Graph Visualization

View Plots For:

Sentiment Analysis using LSTM - Comparison with RNN

Final Accuracies

Simple RNN

LSTM

Classification Reports

RNN Classification Report

LSTM Classification Report

Experiment Completed!