Setup

Library Setup

!pip install --upgrade datasets --quiet

import pandas as pd
import joblib
import time
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from datasets import load_dataset

Data Setup

# Make Google Drive available so the trained model can be written to it later
print("Mounting Google Drive...")
drive.mount('/content/drive')

# Download IMDB and merge its 'train' and 'test' splits into one DataFrame;
# a fresh split is drawn from the combined data below.
print("\nLoading and preparing dataset...")
dataset = load_dataset("imdb")
split_frames = [pd.DataFrame(dataset[split]) for split in ('train', 'test')]
df = pd.concat(split_frames, ignore_index=True)
df = df.rename(columns={'text': 'review', 'label': 'sentiment'})

# Replace the integer labels with readable class names
label_names = {0: 'Negative', 1: 'Positive'}
df['sentiment'] = df['sentiment'].map(label_names)

# Hold out 20% for testing; stratify on the label so both parts keep the same
# class balance, and fix the seed so the split is reproducible.
X, y = df['review'], df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print("Dataset ready.")

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Loading and preparing dataset...
Dataset ready.

Building a Model

Define Model

I chose logistic regression as the classifier, trained on TF-IDF features extracted from the review text.

# --- Define the Logistic Regression Pipeline ---
# Step 1 converts raw review text into TF-IDF features (English stop words
# removed); step 2 fits a logistic-regression classifier on those features.
# The step names are referenced by the grid-search parameter keys.
tfidf_step = ('tfidf', TfidfVectorizer(stop_words='english'))
logreg_step = ('logreg', LogisticRegression(solver='liblinear', max_iter=1000))
logreg_pipeline = Pipeline(steps=[tfidf_step, logreg_step])

Run Grid Search

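Next, I define the hyperparameter grid to search: the TF-IDF n-gram range and minimum document frequency, and the logistic-regression inverse regularization strength `C`.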

# --- Define the Parameters for the Grid Search ---
# Keys use scikit-learn's "<step name>__<parameter>" convention, so the
# prefixes must match the step names in logreg_pipeline ('tfidf', 'logreg').
logreg_params = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    'tfidf__min_df': [3, 5],                 # minimum document frequency for a term to be kept
    'logreg__C': [1, 10],                    # inverse regularization strength
}

# --- Run the Grid Search ---
# The banner starts with "\n" like the other status messages; the previous
# "\-" was an invalid escape sequence that printed a literal backslash and
# triggers a SyntaxWarning on recent Python versions.
print("\n--- Starting Grid Search for Logistic Regression ---")
start_time = time.time()
# 2 x 2 x 2 = 8 candidates, each cross-validated on 3 folds; n_jobs=-1 runs the
# fits in parallel on all available cores.
grid_search_logreg = GridSearchCV(logreg_pipeline, logreg_params, cv=3, n_jobs=-1, verbose=2)
grid_search_logreg.fit(X_train, y_train)
print(f"\nGrid Search completed in {time.time() - start_time:.2f} seconds.")

\--- Starting Grid Search for Logistic Regression ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits

Grid Search completed in 293.88 seconds.

Model Evaluation

# GridSearchCV has already refit the winning configuration on the full
# training data, so best_estimator_ is ready to use without further training.
best_model = grid_search_logreg.best_estimator_

# --- Cross-Validation Summary ---
print("--- Grid Search Results ---")
print(f"Best Cross-Validation Score: {grid_search_logreg.best_score_:.4f}")
print("Best Parameters Found:", grid_search_logreg.best_params_)

# --- Held-Out Test Set ---
# X_test / y_test were never seen by the search, so this score is the
# estimate of performance on new reviews.
print("\n--- Final Evaluation on Test Set ---")
final_accuracy = best_model.score(X_test, y_test)
print(f"Final accuracy on the test set: {final_accuracy:.4f}")

--- Grid Search Results ---
Best Cross-Validation Score: 0.8998
Best Parameters Found: {'logreg__C': 10, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 2)}

--- Final Evaluation on Test Set ---
Final accuracy on the test set: 0.9080

# --- Persist the Tuned Model to Google Drive ---
# best_model is the whole fitted pipeline (vectorizer + classifier), so the
# saved file can score raw review text after joblib.load.
model_name = 'imdb_model.joblib'
drive_path = f'/content/drive/My Drive/{model_name}'
joblib.dump(value=best_model, filename=drive_path)
print(f"Best model saved to Google Drive at: {drive_path}")

Best model saved to Google Drive at: /content/drive/My Drive/imdb_model.joblib