!pip install --upgrade datasets --quiet
Setup
Library Setup
import pandas as pd
import joblib
import time
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from datasets import load_dataset
Data Setup
# Mount Google Drive to save the final model
print("Mounting Google Drive...")
drive.mount('/content/drive')
# Load and prepare the dataset
print("\nLoading and preparing dataset...")
dataset = load_dataset("imdb")
df = pd.concat([
    pd.DataFrame(dataset['train']),
    pd.DataFrame(dataset['test'])
], ignore_index=True)
df = df.rename(columns={'text': 'review', 'label': 'sentiment'})
df['sentiment'] = df['sentiment'].map({0: 'Negative', 1: 'Positive'})

# Create the train/test split
X = df['review']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("Dataset ready.")
Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading and preparing dataset...
Dataset ready.
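Before modeling, it is worth a quick sanity check that the merged dataframe looks right. A minimal sketch, assuming the df built above is in scope (the IMDB corpus should yield 50,000 balanced rows):

# Confirm shape, class balance, and a sample review
print(df.shape)                        # expect (50000, 2)
print(df['sentiment'].value_counts())  # expect 25,000 per class
print(df['review'].iloc[0][:200])      # first 200 characters of one review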
Building a Model
Define Model
I choose logistic regression: it trains quickly on the high-dimensional sparse features TF-IDF produces, and its coefficients remain interpretable.
# --- Define the Logistic Regression Pipeline ---
logreg_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('logreg', LogisticRegression(solver='liblinear', max_iter=1000))
])
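To make the first pipeline stage concrete, here is a self-contained sketch of what TfidfVectorizer produces on a toy corpus (a standalone illustration, separate from the pipeline above):

from sklearn.feature_extraction.text import TfidfVectorizer

toy_corpus = ["a great movie", "a terrible movie"]
vec = TfidfVectorizer(stop_words='english')  # same setting as the pipeline
tfidf = vec.fit_transform(toy_corpus)        # sparse matrix: one row per document
print(vec.get_feature_names_out())           # ['great' 'movie' 'terrible']
print(tfidf.toarray().round(2))              # the weights the classifier consumes

The shared token 'movie' receives a lower weight than the discriminative tokens 'great' and 'terrible', which is exactly the signal the logistic regression step exploits.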
Run Grid Search
Next, I define the hyperparameter values for the grid search to test: unigrams versus unigrams plus bigrams, two minimum document frequencies, and two regularization strengths.
# --- Define the Parameters for the Grid Search ---
logreg_params = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__min_df': [3, 5],
    'logreg__C': [1, 10],
}
# --- Run the Grid Search ---
print("\--- Starting Grid Search for Logistic Regression ---")
= time.time()
start_time = GridSearchCV(logreg_pipeline, logreg_params, cv=3, n_jobs=-1, verbose=2)
grid_search_logreg
grid_search_logreg.fit(X_train, y_train)print(f"\nGrid Search completed in {time.time() - start_time:.2f} seconds.")
--- Starting Grid Search for Logistic Regression ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Grid Search completed in 293.88 seconds.
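The single best score is only part of the picture; every grid cell's cross-validation result is stored in cv_results_. A short sketch, assuming grid_search_logreg has been fit as above:

# Rank all eight parameter combinations by mean cross-validation accuracy
cv_results = pd.DataFrame(grid_search_logreg.cv_results_)
cols = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
print(cv_results[cols].sort_values('rank_test_score').to_string(index=False))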
Model Evaluation
print("--- Grid Search Results ---")
print(f"Best Cross-Validation Score: {grid_search_logreg.best_score_:.4f}")
print("Best Parameters Found:", grid_search_logreg.best_params_)
# The best model is already trained on the full training data
best_model = grid_search_logreg.best_estimator_
# --- Final Evaluation on the Held-Out Test Set ---
print("\n--- Final Evaluation on Test Set ---")
final_accuracy = best_model.score(X_test, y_test)
print(f"Final accuracy on the test set: {final_accuracy:.4f}")
--- Grid Search Results ---
Best Cross-Validation Score: 0.8998
Best Parameters Found: {'logreg__C': 10, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 2)}
--- Final Evaluation on Test Set ---
Final accuracy on the test set: 0.9080
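Accuracy alone can mask asymmetric errors, so a per-class breakdown is a cheap complement. A sketch, assuming best_model, X_test, and y_test from the cells above:

from sklearn.metrics import classification_report, confusion_matrix

# Precision, recall, and F1 for each class, plus the raw confusion matrix
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))  # rows = true labels, columns = predictions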
# --- Save the Final Model ---
model_name = 'imdb_model.joblib'
drive_path = f'/content/drive/My Drive/{model_name}'

joblib.dump(best_model, drive_path)
print(f"Best model saved to Google Drive at: {drive_path}")
Best model saved to Google Drive at: /content/drive/My Drive/imdb_model.joblib
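Because the vectorizer is bundled inside the saved pipeline, the model can later be reloaded and applied directly to raw text. A sketch assuming the same Drive path as above:

# Reload the pipeline in a fresh session and classify unseen reviews
loaded_model = joblib.load('/content/drive/My Drive/imdb_model.joblib')
samples = [
    "An absolute masterpiece, I loved every minute.",
    "Dull, predictable, and far too long.",
]
print(loaded_model.predict(samples))  # expect something like ['Positive' 'Negative']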