Setup

Library Setup

!pip install --upgrade datasets --quiet
import pandas as pd
import joblib
import time
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from datasets import load_dataset

Data Setup

# Google Drive is mounted up front; the trained model is written there later.
print("Mounting Google Drive...")
drive.mount('/content/drive')

# Fetch the IMDB dataset and stack its 'train' and 'test' splits into a single
# DataFrame, so that a fresh split can be drawn from the combined rows below.
print("\nLoading and preparing dataset...")
dataset = load_dataset("imdb")
df = pd.concat(
    [pd.DataFrame(dataset[split]) for split in ('train', 'test')],
    ignore_index=True,
)

# Give the columns descriptive names and turn the 0/1 labels into strings.
df = df.rename(columns={'text': 'review', 'label': 'sentiment'})
df['sentiment'] = df['sentiment'].map({0: 'Negative', 1: 'Positive'})

# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions equal in both parts, and the fixed seed makes the split
# reproducible.
X, y = df['review'], df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print("Dataset ready.")
Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Loading and preparing dataset...
Dataset ready.

Building a Model

Define Model

I chose logistic regression as the classifier, trained on TF-IDF features extracted from the review text.

# --- Define the Logistic Regression Pipeline ---
# Two stages: TF-IDF vectorisation of the raw text (English stop words
# removed), followed by a liblinear logistic-regression classifier.
# The step names ('tfidf', 'logreg') are the prefixes used in hyper-parameter
# keys such as 'tfidf__min_df' and 'logreg__C'.
logreg_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('logreg', LogisticRegression(solver='liblinear', max_iter=1000)),
])

Model Evaluation

# NOTE(review): `grid_search_logreg` is not defined in any visible cell. The
# attributes read below (best_score_, best_params_, best_estimator_) suggest a
# fitted GridSearchCV built around logreg_pipeline — confirm that the cell
# which creates and fits it exists and runs before this one, otherwise this
# cell raises NameError.
print("--- Grid Search Results ---")
print(f"Best Cross-Validation Score: {grid_search_logreg.best_score_:.4f}")
print("Best Parameters Found:", grid_search_logreg.best_params_)

# The best model is already trained on the full training data
# (presumably because the search used the default refit=True — TODO confirm
# against the cell that constructs grid_search_logreg).
best_model = grid_search_logreg.best_estimator_

# --- Final Evaluation on the Held-Out Test Set ---
# X_test / y_test are the 20% of rows set aside by train_test_split above.
# Assumes the search was fitted on X_train / y_train only, so these rows are
# unseen by the model — verify in the fitting cell.
print("\n--- Final Evaluation on Test Set ---")
final_accuracy = best_model.score(X_test, y_test)
print(f"Final accuracy on the test set: {final_accuracy:.4f}")
--- Grid Search Results ---
Best Cross-Validation Score: 0.8998
Best Parameters Found: {'logreg__C': 10, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 2)}

--- Final Evaluation on Test Set ---
Final accuracy on the test set: 0.9080
# --- Save the Final Model ---
# Serialise best_model with joblib into the Google Drive folder mounted at
# /content/drive, then report the destination path.
# NOTE: joblib files are pickle-based; only load this file back from a
# location you trust.
model_name = 'imdb_model.joblib'
drive_path = f'/content/drive/My Drive/{model_name}'
joblib.dump(best_model, drive_path)
print(f"Best model saved to Google Drive at: {drive_path}")
Best model saved to Google Drive at: /content/drive/My Drive/imdb_model.joblib