!pip install --upgrade datasets --quiet
Setup
Library Setup
import pandas as pd
import joblib
import time
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from datasets import load_dataset
Data Setup
# Attach Google Drive so the trained model can be written there at the end.
print("Mounting Google Drive...")
drive.mount('/content/drive')

# Download IMDB and merge its 'train' and 'test' splits into one frame;
# a new split is drawn further down instead of reusing the published one.
print("\nLoading and preparing dataset...")
dataset = load_dataset("imdb")
split_frames = [pd.DataFrame(dataset[split_name]) for split_name in ('train', 'test')]
df = (
    pd.concat(split_frames, ignore_index=True)
    .rename(columns={'text': 'review', 'label': 'sentiment'})
    # Swap the integer labels for readable class names.
    .assign(sentiment=lambda frame: frame['sentiment'].map({0: 'Negative', 1: 'Positive'}))
)

# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions equal in both parts, and the fixed seed makes the split
# reproducible.
X, y = df['review'], df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print("Dataset ready.")
Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading and preparing dataset...
Dataset ready.
Building a Model
Define Model
I choose to use logistic regression.
# --- Logistic Regression pipeline: TF-IDF features feeding the classifier ---
# The step names ('tfidf', 'logreg') are the prefixes used by the grid-search
# parameter keys, so they must not change.
tfidf_step = ('tfidf', TfidfVectorizer(stop_words='english'))
logreg_step = ('logreg', LogisticRegression(solver='liblinear', max_iter=1000))
logreg_pipeline = Pipeline([tfidf_step, logreg_step])
# --- Run Grid Search ---
Next, I choose the parameters to test in the grid search.
# --- Hyper-parameter grid explored by the search ---
# Each key has the form "<pipeline step>__<parameter>", so the entries target
# the 'tfidf' and 'logreg' steps of the pipeline. 2 x 2 x 2 = 8 candidates.
logreg_params = dict(
    tfidf__ngram_range=[(1, 1), (1, 2)],
    tfidf__min_df=[3, 5],
    logreg__C=[1, 10],
)
# --- Run the Grid Search ---
# Evaluate every combination in logreg_params with 3-fold cross-validation,
# using all available CPU cores (n_jobs=-1) and per-fit progress output.
# The banner starts with "\n" (a blank line), matching the other banners;
# the previous "\-" was an invalid escape that printed a literal backslash.
print("\n--- Starting Grid Search for Logistic Regression ---")
start_time = time.time()  # wall-clock start, used to report the elapsed time
grid_search_logreg = GridSearchCV(
    logreg_pipeline,
    logreg_params,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search_logreg.fit(X_train, y_train)
print(f"\nGrid Search completed in {time.time() - start_time:.2f} seconds.")
\--- Starting Grid Search for Logistic Regression ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Grid Search completed in 293.88 seconds.
Model Evaluation
# --- Summarise what the grid search found ---
best_cv_score = grid_search_logreg.best_score_
best_params = grid_search_logreg.best_params_
print("--- Grid Search Results ---")
print(f"Best Cross-Validation Score: {best_cv_score:.4f}")
print("Best Parameters Found:", best_params)

# GridSearchCV refits the winning configuration on the whole training set by
# default (refit=True), so best_estimator_ can be used without further training.
best_model = grid_search_logreg.best_estimator_

# --- Score the refitted model on the held-out test rows ---
print("\n--- Final Evaluation on Test Set ---")
final_accuracy = best_model.score(X_test, y_test)
print(f"Final accuracy on the test set: {final_accuracy:.4f}")
--- Grid Search Results ---
Best Cross-Validation Score: 0.8998
Best Parameters Found: {'logreg__C': 10, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 2)}
--- Final Evaluation on Test Set ---
Final accuracy on the test set: 0.9080
# --- Persist the fitted pipeline to the mounted Google Drive ---
drive_dir = '/content/drive/My Drive'
model_name = 'imdb_model.joblib'
drive_path = f'{drive_dir}/{model_name}'
# Serialises the whole pipeline (vectorizer + classifier) into one file.
joblib.dump(value=best_model, filename=drive_path)
print(f"Best model saved to Google Drive at: {drive_path}")
Best model saved to Google Drive at: /content/drive/My Drive/imdb_model.joblib