# View and modify the working path
import os

from google.colab import drive

# Report the directory the notebook starts in.
print("Current Working *Directory*:", os.getcwd())

# Mount Google Drive so the project files are reachable from this runtime.
drive.mount('/content/gdrive')

# Move into the folder that holds the project code (edit to match your Drive).
path = "/content/gdrive/My Drive/BD4H-project-main/code"
os.chdir(path)

# Confirm the change
print("Working Directory:", os.getcwd())
Current Working *Directory*: /content
Mounted at /content/gdrive
Working Directory: /content/gdrive/My Drive/BD4H-project/code
Initialize Paper (Xiao et al., 2018) Parameters
TAs - for the purpose of faster runtime, we are intentionally setting the parameters “num_epochs” to 2 and “num_trials” to 2, even though Xiao et al. use 6 and 10, respectively. To show a replication of the original paper, the evaluation outputs you see at the end (see the “Executing Models” section) come from runs that used the original 6-epoch / 10-trial parameters.
import os
import pickle
import time
import zipfile
from statistics import mean, stdev

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.cluster import MiniBatchKMeans
from sklearn.manifold import TSNE
from sklearn.metrics import (
    accuracy_score,
    precision_recall_curve,
    precision_recall_fscore_support,
    roc_auc_score,
)
from sklearn.metrics import average_precision_score as pr_auc
from torch.utils.data import DataLoader, Dataset


class Config:
    """Central place for file locations, vocabulary settings and hyperparameters.

    Everything is a class attribute, so values can be read from the class or
    overridden per instance. The same settings could equally be kept in a
    YAML/JSON file.
    """

    # ---- Raw data and vocabulary files ----
    dataset_dir = "../resource"
    zipped_file = os.path.join(dataset_dir, "S1_Data.zip")
    input_file = os.path.join(dataset_dir, "S1_Data.txt")  # produced by unzipping
    vocab_file = os.path.join(dataset_dir, "vocab.txt")
    stop_file = os.path.join(dataset_dir, "stop.txt")
    vocab_pkl = os.path.join(dataset_dir, "vocab.pkl")

    # ---- Pickled train / validation / test splits ----
    pkl_train_x = os.path.join(dataset_dir, "X_train.pkl")
    pkl_train_y = os.path.join(dataset_dir, "Y_train.pkl")
    pkl_val_x = os.path.join(dataset_dir, "X_valid.pkl")
    pkl_val_y = os.path.join(dataset_dir, "Y_valid.pkl")
    pkl_test_x = os.path.join(dataset_dir, "X_test.pkl")
    pkl_test_y = os.path.join(dataset_dir, "Y_test.pkl")

    # ---- Vocabulary construction ----
    rare_word_threshold = 100
    stop_word_threshold = 1e4
    unknown_index = 1
    vocab_size = 490
    n_stops = 12  # last 12 are considered "stop words"
    n_topics = 50
    max_visit_len = 300

    # ---- Model hyperparameters ----
    embed_size = 100
    hidden_size = 200

    # ---- Training settings ----
    batch_size = 1
    grad_clip = 100
    learning_rate = 0.001
    num_epochs = 2
    num_trials = 2

    # ---- Output directories for intermediate artefacts ----
    content_theta_dir = "../output/CONTENT_theta"
    gru_hiddens_dir = "../output/GRU_hiddens"
    content_results_dir = "../output/CONTENT_results"
    gru_results_dir = "../output/GRU_results"

    # Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    # Runs once, when the class body is executed.
    print(f"[Config] Device: {device}")
def save_pkl(path, obj):
    """Pickle ``obj`` to ``path`` and print a confirmation line."""
    with open(path, "wb") as f:
        pickle.dump(obj, f)
    print(f"Saved: {path}")


def load_pkl(path):
    """Unpickle and return the object stored at ``path``.

    NOTE: unpickling can execute arbitrary code; only load files that were
    produced by this project.
    """
    with open(path, "rb") as f:
        obj = pickle.load(f)
    print(f"Loaded: {path}")
    return obj


def save_npy(path, arr):
    """Write ``arr`` to ``path`` with ``np.save`` and print a confirmation line."""
    np.save(path, arr)
    print(f"Saved: {path}")


def load_npy(path):
    """Load and return an array from ``path`` (object arrays are allowed)."""
    arr = np.load(path, allow_pickle=True)
    print(f"Loaded: {path}")
    return arr


def ensure_data_unzipped(config: Config):
    """Extract ``config.zipped_file`` into ``config.dataset_dir`` if needed.

    Does nothing (apart from printing) when ``config.input_file`` already
    exists.
    """
    if os.path.exists(config.input_file):
        print(f"S1_Data.txt already unzipped at: {config.input_file}")
        return
    with zipfile.ZipFile(config.zipped_file, 'r') as zip_ref:
        zip_ref.extractall(config.dataset_dir)
    print(f"Unzipped {config.zipped_file} => {config.dataset_dir}")


def build_vocab(config: Config):
    """Create ``vocab_file`` and ``stop_file`` from the raw data file.

    Diagnosis descriptions are counted, descriptions seen at most
    ``rare_word_threshold`` times are dropped, and the remainder is written
    in ascending frequency order with row numbers starting at 2. Descriptions
    seen more than ``stop_word_threshold`` times are also written to
    ``stop_file``.
    """
    df = pd.read_csv(config.input_file, sep="\t", header=0)
    grouped = df.groupby("DX_GROUP_DESCRIPTION").size().reset_index(name="SIZE")

    # Filter out rare descriptions, then sort by frequency ascending.
    grouped = grouped[grouped["SIZE"] > config.rare_word_threshold]
    grouped = grouped.sort_values(by="SIZE").reset_index(drop=True)

    vocab = grouped["DX_GROUP_DESCRIPTION"]
    vocab.index += 2  # file indices start at 2
    vocab.to_csv(config.vocab_file, sep="\t", header=False, index=True)
    print("Number of valid tokens:", len(vocab))

    # Stop words => extremely frequent descriptions.
    stops = grouped[grouped["SIZE"] > config.stop_word_threshold]
    stops["DX_GROUP_DESCRIPTION"].to_csv(config.stop_file, sep="\t", header=False, index=False)


def load_vocab_dict(config: Config):
    """Read ``config.vocab_file`` and return a ``{token: index}`` dict.

    Each non-empty line is ``<index>\\t<token>``; the stored index is the file
    index minus one. The reverse mapping ``{index: token}`` is pickled to
    ``config.vocab_pkl``.

    NOTE(review): because build_vocab starts file indices at 2 and this
    function subtracts 1, the first token receives id 1, which equals
    ``Config.unknown_index`` — confirm that this overlap is intended.
    """
    word_to_index = {}
    # The file is written by pandas ``to_csv`` (UTF-8), so read it as UTF-8.
    with open(config.vocab_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline) instead of crashing.
                continue
            idx_str, token = line.split("\t", 1)
            word_to_index[token] = int(idx_str) - 1

    reverse_mapping = {v: k for k, v in word_to_index.items()}
    save_pkl(config.vocab_pkl, reverse_mapping)
    print(f"Vocab size: {len(word_to_index)}")
    return word_to_index


def loadEmbeddingMatrix(wordvecFile, word_to_index, vocab_size):
    """Build a ``(vocab_size, dim)`` float32 embedding matrix from a text file.

    The first line of ``wordvecFile`` is ``<count> <dim>``; every following
    line is a word (which may contain spaces) followed by ``dim`` numbers.
    Words missing from ``word_to_index`` are reported and skipped. A word
    with index ``k`` is stored in row ``k - 1``; rows never assigned stay zero.
    """
    with open(wordvecFile, "r") as fw:
        # Header: total number of words and embedding dimension (only dim is used).
        header = fw.readline().strip().split()
        dim = int(header[1])

        W = np.zeros((vocab_size, dim), dtype=np.float32)
        for line in fw:
            parts = line.strip().split()
            if not parts:
                continue
            # Everything before the last ``dim`` fields is the (possibly multi-word) token.
            word = " ".join(parts[:-dim])
            vec = np.array(parts[-dim:], dtype=np.float32)
            if word not in word_to_index:
                print(f"{word} is not in vocabulary; skipping.")
                continue
            # Vocabulary indices are 1-based, matrix rows are 0-based.
            W[word_to_index[word] - 1] = vec
    return W
Data Preprocessing and Splitting
def extract_inpatient_events(config: Config):
    """Return one row per (PID, DAY_ID) with an 'INPATIENT HOSPITAL' record.

    The result is indexed by PID, sorted by PID then DAY_ID, and has the
    columns DAY_ID, SERVICE_LOCATION and COUNT (number of raw rows). It is
    used by ``convert_format`` to label visits followed by an inpatient stay.
    """
    df = pd.read_csv(config.input_file, sep="\t", header=0)
    inpat = df[df["SERVICE_LOCATION"] == "INPATIENT HOSPITAL"]
    grouped = (
        inpat.groupby(["PID", "DAY_ID", "SERVICE_LOCATION"])
        .size()
        .reset_index(name="COUNT")
        .sort_values(["PID", "DAY_ID"], ascending=True)
        .set_index("PID")
    )
    return grouped


def convert_format(config: Config, word2idx, inpat_df):
    """Stream the raw data file and build per-patient visit documents and labels.

    Rows are assumed to be grouped by PID and, within a patient, by DAY_ID
    (a change in either value starts a new patient / visit).

    Args:
        config: provides ``input_file`` and ``unknown_index``.
        word2idx: maps a diagnosis description to its integer code; missing
            descriptions map to ``config.unknown_index``.
        inpat_df: PID-indexed frame with a DAY_ID column, as returned by
            ``extract_inpatient_events``.

    Returns:
        ``(docs, labels)`` where ``docs[i]`` is a list of visits (each a list
        of codes) and ``labels[i]`` has one 0/1 entry per visit. A visit on
        day ``d`` is labelled 1 when the patient has an inpatient record with
        ``d <= DAY_ID < d + 30``.
    """
    # Build the PID -> inpatient days table once instead of querying the
    # DataFrame for every visit.
    inpatient_days = {
        pid: days.tolist() for pid, days in inpat_df["DAY_ID"].groupby(level=0)
    }

    def is_readmitted(pid, day):
        start = int(day)
        return any(start <= d < start + 30 for d in inpatient_days.get(int(pid), ()))

    docs, labels = [], []
    with open(config.input_file, "r") as f:
        header = f.readline().strip().split("\t")
        col_idx = {h: i for i, h in enumerate(header)}

        pid = day_id = None
        doc, visit_codes, label_seq = [], [], []
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Skip blank lines (e.g. at the end of the file) instead of
                # failing on a missing column.
                continue
            tokens = stripped.split("\t")
            c_pid, c_day = tokens[col_idx["PID"]], tokens[col_idx["DAY_ID"]]

            if c_pid != pid:
                if pid is not None:
                    # Close the previous patient's last visit and document.
                    doc.append(visit_codes)
                    docs.append(doc)
                    labels.append(label_seq)
                    doc, visit_codes, label_seq = [], [], []
                pid, day_id = c_pid, c_day
                label_seq.append(1 if is_readmitted(pid, day_id) else 0)
            elif c_day != day_id:
                # Same patient, new day => new visit.
                doc.append(visit_codes)
                visit_codes = []
                day_id = c_day
                label_seq.append(1 if is_readmitted(pid, day_id) else 0)

            diag_str = tokens[col_idx["DX_GROUP_DESCRIPTION"]]
            visit_codes.append(word2idx.get(diag_str, config.unknown_index))

    if pid is None:
        # No data rows at all.
        return docs, labels

    # Finalize the last patient.
    doc.append(visit_codes)
    docs.append(doc)
    labels.append(label_seq)
    return docs, labels


def split_and_save(config: Config, docs, labels, train_end=2000, val_end=2500):
    """Split docs/labels by position into train/valid/test and pickle each part.

    Args:
        config: provides the six ``pkl_*`` output paths.
        docs, labels: parallel lists, one entry per patient.
        train_end: patients ``[0, train_end)`` form the training split.
        val_end: patients ``[train_end, val_end)`` form the validation split;
            the rest is the test split. The defaults are the notebook's
            original fixed boundaries.
    """
    save_pkl(config.pkl_train_x, docs[:train_end])
    save_pkl(config.pkl_train_y, labels[:train_end])
    save_pkl(config.pkl_val_x, docs[train_end:val_end])
    save_pkl(config.pkl_val_y, labels[train_end:val_end])
    save_pkl(config.pkl_test_x, docs[val_end:])
    save_pkl(config.pkl_test_y, labels[val_end:])
PyTorch Dataset & DataLoader
class PatientVisitsDataset(Dataset):
    """Dataset over (docs, labels) for one split (train/valid/test).

    Items are returned untouched as ``(doc, label_sequence)``; conversion to
    multi-hot tensors is left to the collate function.
    """

    def __init__(self, docs, labels, vocab_size, max_len):
        super().__init__()
        self.docs = docs
        self.labels = labels
        self.vocab_size = vocab_size
        # Previously accepted but discarded; kept so the setting is inspectable.
        self.max_len = max_len

    def __len__(self):
        return len(self.docs)

    def __getitem__(self, idx):
        return self.docs[idx], self.labels[idx]


def multi_hot_collate_fn(batch, vocab_size, max_len):
    """Turn a list of ``(doc, labels)`` pairs into ``(x, y, mask)`` tensors.

    Every sample is truncated or zero-padded to exactly ``max_len`` visits.

    Returns:
        x: float32 ``[batch, max_len, vocab_size]`` multi-hot visit encoding;
           code ``c`` sets column ``c - 1`` (codes are 1-based).
        y: float32 ``[batch, max_len]`` labels; padded positions hold 1.
        mask: float32 ``[batch, max_len]``; 1 for real visits, 0 for padding.
    """
    batch_size = len(batch)
    x_array = np.zeros((batch_size, max_len, vocab_size), dtype=np.float32)
    y_array = np.ones((batch_size, max_len), dtype=np.float32)
    mask_array = np.zeros((batch_size, max_len), dtype=np.float32)

    for i, (doc, lab) in enumerate(batch):
        # Keep at most max_len visits.
        seq_len = min(len(doc), max_len)
        mask_array[i, :seq_len] = 1
        y_array[i, :seq_len] = lab[:seq_len]
        for j, visit_codes in enumerate(doc[:seq_len]):
            for code_idx in visit_codes:
                x_array[i, j, code_idx - 1] = 1

    return (
        torch.from_numpy(x_array),
        torch.from_numpy(y_array),
        torch.from_numpy(mask_array),
    )


def create_dataloader(docs, labels, config: Config, shuffle=False):
    """Build a DataLoader over docs/labels that yields ``(x, y, mask)`` batches.

    Batch size, vocabulary size and maximum visit length are read from
    ``config``.
    """
    # Local import so this cell does not depend on another cell's imports.
    from functools import partial

    dataset = PatientVisitsDataset(docs, labels, config.vocab_size, config.max_visit_len)
    # partial (unlike a lambda) can be pickled, which DataLoader worker
    # processes require on spawn-based platforms.
    collate = partial(
        multi_hot_collate_fn,
        vocab_size=config.vocab_size,
        max_len=config.max_visit_len,
    )
    return DataLoader(
        dataset,
        batch_size=config.batch_size,
        shuffle=shuffle,
        collate_fn=collate,
    )
Executing Data Prep
You can comment out 1 and 2 if already run - loads existing data.
# Tally how many visit-level labels are positive (1) and negative (0)
# across every patient's label sequence.
label_distribution = {1: 0, 0: 0}
for flag in (value for patient_labels in labels for value in patient_labels):
    label_distribution[flag] += 1

positives = label_distribution[1]
negatives = label_distribution[0]
print("Label 1 has {} counts, and Label 0 has {} counts".format(positives, negatives))
Label 1 has 54427 counts, and Label 0 has 185509 counts
class GRUModel(nn.Module):
    """Baseline recurrent model: multi-hot visits -> GRU -> per-visit probability.

    No embedding layer and no topic component are used. ``kl_term`` and the
    trailing ``None`` in the forward output exist only so the model can be
    used interchangeably with ContentModel in train() / evaluate_model().
    """

    def __init__(self, config: Config):
        super().__init__()
        self.vocab_size = config.vocab_size
        self.hidden_size = config.hidden_size

        # The GRU consumes the raw multi-hot vectors directly.
        self.gru = nn.GRU(
            input_size=self.vocab_size,
            hidden_size=self.hidden_size,
            batch_first=True,
        )
        # One score per time step.
        self.out_layer = nn.Linear(self.hidden_size, 1)

        # Constant placeholder for the ContentModel-style KL term.
        self.kl_term = 0.0

    def forward(self, x, mask):
        """Return ``(probabilities, h_n, None)``.

        Args:
            x: ``[batch, seq_len, vocab_size]`` input.
            mask: ``[batch, seq_len]``; padded positions are 0.
        """
        # Zero out padded visits before they reach the GRU.
        visible = x * mask.unsqueeze(-1)
        gru_out, h_n = self.gru(visible)

        # [B, T, 1] -> [B, T] scores, squashed to probabilities.
        scores = self.out_layer(gru_out).squeeze(-1)
        probs = torch.sigmoid(scores) * mask  # padded positions become 0

        # Keep every value strictly inside (0, 1).
        probs = probs.clamp(min=1e-6, max=1 - 1e-6)
        return probs, h_n, None
Defining Training Function
def train(model, loader, optimizer, config: Config):
    """Run one training pass over ``loader``.

    Args:
        model: a ContentModel or GRUModel instance.
        loader: yields ``(x, y, mask)`` batches.
        optimizer: optimizer bound to ``model``'s parameters.
        config: provides ``device`` and ``grad_clip``.

    Returns:
        ``(avg_loss, collector)`` where ``avg_loss`` is the mean batch loss and
        ``collector`` holds one numpy array per batch: the final hidden state
        for GRUModel, or theta at the first time step concatenated with the
        final hidden state for ContentModel.

    Raises:
        TypeError: if ``model`` is neither a ContentModel nor a GRUModel
            (previously this surfaced later as an unbound ``loss``).
    """
    is_content = isinstance(model, ContentModel)
    if not is_content and not isinstance(model, GRUModel):
        raise TypeError(f"Unsupported model type: {type(model).__name__}")

    model.train()
    total_loss = 0.0
    batch_count = 0
    collector = []

    for x_batch, y_batch, m_batch in loader:
        x_batch = x_batch.to(config.device)
        y_batch = y_batch.to(config.device)
        m_batch = m_batch.to(config.device)

        optimizer.zero_grad()
        preds, h_n, theta = model(x_batch, m_batch)

        # Mean BCE over real (unmasked) visits only; the clamp avoids a
        # division by zero if a batch contains no real visits.
        bce = F.binary_cross_entropy(preds, y_batch, reduction='none')
        bce = (bce * m_batch).sum() / m_batch.sum().clamp(min=1.0)

        rnn_vec = h_n.squeeze(0).detach().cpu().numpy()
        if is_content:
            loss = bce + model.kl_term  # add KL term from model
            # Keep theta at the first time step alongside the hidden state.
            theta_np = theta[:, 0, :].detach().cpu().numpy()
            collector.append(np.concatenate([theta_np, rnn_vec], axis=1))
        else:
            loss = bce
            collector.append(rnn_vec)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)
        optimizer.step()

        total_loss += loss.item()
        batch_count += 1

    avg_loss = total_loss / max(batch_count, 1)
    return avg_loss, collector
Defining Evaluation Functions
def evaluate_model(model, loader, config: Config):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: a ContentModel or GRUModel instance.
        loader: yields ``(x, y, mask)`` batches.
        config: provides ``device``.

    Returns:
        ``(avg_loss, all_true, all_pred, all_theta_hidden)``. ``all_true`` and
        ``all_pred`` are flat lists covering only unmasked visits;
        ``all_theta_hidden`` holds one numpy array per batch (same layout as
        the collector returned by ``train``).

    Raises:
        TypeError: if ``model`` is neither a ContentModel nor a GRUModel
            (previously this surfaced later as an unbound ``loss``).
    """
    is_content = isinstance(model, ContentModel)
    if not is_content and not isinstance(model, GRUModel):
        raise TypeError(f"Unsupported model type: {type(model).__name__}")

    model.eval()
    total_loss = 0.0
    batch_count = 0
    all_true, all_pred = [], []
    all_theta_hidden = []

    with torch.no_grad():
        for x_batch, y_batch, mask_batch in loader:
            x_batch = x_batch.to(config.device)
            y_batch = y_batch.to(config.device)
            mask_batch = mask_batch.to(config.device)

            preds, h_n, theta = model(x_batch, mask_batch)

            # Mean BCE over real visits; clamp avoids division by zero when a
            # batch has no real visits.
            bce = F.binary_cross_entropy(preds, y_batch, reduction='none')
            bce = (bce * mask_batch).sum() / mask_batch.sum().clamp(min=1.0)

            rnn_vec = h_n.squeeze(0).detach().cpu().numpy()
            if is_content:
                loss = bce + model.kl_term  # add KL term from model
                theta_np = theta[:, 0, :].detach().cpu().numpy()
                all_theta_hidden.append(np.concatenate([theta_np, rnn_vec], axis=1))
            else:
                loss = bce
                all_theta_hidden.append(rnn_vec)

            total_loss += loss.item()
            batch_count += 1

            # Flatten predictions/labels, keeping only the first `length`
            # (unmasked) positions of every sequence.
            seq_lens = mask_batch.sum(dim=1).cpu().numpy().astype(int)
            preds_np = preds.detach().cpu().numpy()
            y_np = y_batch.detach().cpu().numpy()
            for row_pred, row_true, length in zip(preds_np, y_np, seq_lens):
                all_pred.extend(row_pred[:length])
                all_true.extend(row_true[:length])

    avg_loss = total_loss / (batch_count if batch_count else 1)
    return avg_loss, all_true, all_pred, all_theta_hidden


def compute_metrics(true_vals, pred_vals, threshold=0.5):
    """Compute ranking and classification metrics for binary predictions.

    Args:
        true_vals: sequence of 0/1 labels.
        pred_vals: sequence of predicted probabilities.
        threshold: probabilities strictly above this value count as positive
            for ACC / precision / recall / F1 (default 0.5, the original
            fixed cut-off).

    Returns:
        dict with keys ``auc``, ``prauc``, ``acc``, ``precision``, ``recall``
        and ``f1``.
    """
    auc_val = roc_auc_score(true_vals, pred_vals)
    pr_val = pr_auc(true_vals, pred_vals)

    preds_bin = (np.array(pred_vals) > threshold).astype(int)
    # zero_division=0 returns 0 without a warning when no positive is
    # predicted (the default warns and also returns 0).
    prec, rec, f1, _ = precision_recall_fscore_support(
        true_vals, preds_bin, average="binary", zero_division=0
    )
    acc_val = accuracy_score(true_vals, preds_bin)
    return {
        "auc": auc_val,
        "prauc": pr_val,
        "acc": acc_val,
        "precision": prec,
        "recall": rec,
        "f1": f1,
    }
OPTIONAL Grid Search
Defining Grid Search Function
def grid_search(param_grid, model_type):
    """Exhaustively evaluate hyperparameter combinations on the validation set.

    Args:
        param_grid: dict with list values for the keys ``hidden_size``,
            ``batch_size``, ``learning_rate``, ``num_epochs`` and ``n_topics``.
        model_type: model class, called as ``model_type(config)``.

    Returns:
        ``(best_val_metric, best_config)``: the highest validation PR-AUC and
        the parameter dict that produced it (``None`` if no run scored above 0).

    Uses the notebook-level ``X_train``/``Y_train``/``X_valid``/``Y_valid``
    splits.
    """
    # Local import: the notebook only imports itertools in a later cell.
    import itertools

    best_val_metric = 0
    best_config = None
    search_keys = ("hidden_size", "batch_size", "learning_rate", "num_epochs", "n_topics")

    for hidden_size, batch_size, learning_rate, num_epochs, n_topics in itertools.product(
        *(param_grid[key] for key in search_keys)
    ):
        print("Current parameters: ")
        print(f"Hidden Size: {hidden_size}")
        print(f"Batch Size: {batch_size}")
        print(f"Learning Rate: {learning_rate}")
        print(f"Num Epochs: {num_epochs}")
        print(f"Num Topics; {n_topics}")

        # Fresh configuration for this combination.
        config = Config()
        config.hidden_size = hidden_size
        config.batch_size = batch_size
        config.learning_rate = learning_rate
        config.num_epochs = num_epochs
        config.n_topics = n_topics

        # (Re)create the DataLoaders so the batch size takes effect.
        train_loader = create_dataloader(X_train, Y_train, config, shuffle=True)
        valid_loader = create_dataloader(X_valid, Y_valid, config, shuffle=False)

        model = model_type(config).to(config.device)
        optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

        # Train for the requested number of epochs. A single train() call
        # would make the num_epochs dimension of the grid meaningless.
        for _ in range(config.num_epochs):
            train(model, train_loader, optimizer, config)

        val_loss, val_true, val_pred, _ = evaluate_model(model, valid_loader, config)
        val_metrics = compute_metrics(val_true, val_pred)
        print("\nPR-AUC: {:.4f}".format(val_metrics["prauc"]))
        print("------------------\n")

        # PR-AUC is the selection metric.
        if val_metrics["prauc"] > best_val_metric:
            best_val_metric = val_metrics["prauc"]
            best_config = {
                "hidden_size": hidden_size,
                "batch_size": batch_size,
                "learning_rate": learning_rate,
                "num_epochs": num_epochs,
                "n_topics": n_topics,
            }

    return best_val_metric, best_config
Executing Grid Search
import os
import itertools

# DEFINE HYPERPARAMETERS YOU WANT TO TRY HERE
param_grid = {
    "hidden_size": [100],
    "batch_size": [1],
    "learning_rate": [0.001, 0.005],
    "num_epochs": [2, 6],
    "n_topics": [50, 150],
}

# Run grid search on CONTENT
content_best_metric, content_best_config = grid_search(param_grid, ContentModel)

# Run grid search on GRU
gru_best_metric, gru_best_config = grid_search(param_grid, GRUModel)

# Keep the two results separate; reusing one pair of variables would
# discard the CONTENT result before it is printed.
print("[CONTENT] Best validation PR-AUC:", content_best_metric)
print("[CONTENT] Best hyperparameters:", content_best_config)
print("[GRU] Best validation PR-AUC:", gru_best_metric)
print("[GRU] Best hyperparameters:", gru_best_config)

# Backward-compatible names: as before, these hold the result of the last
# search that ran (GRU).
best_val_metric, best_config = gru_best_metric, gru_best_config
Current parameters:
Hidden Size: 100
Batch Size: 1
Learning Rate: 0.001
Num Epochs: 2
Num Topics; 50
PR-AUC: 0.6432
------------------
Current parameters:
Hidden Size: 100
Batch Size: 1
Learning Rate: 0.001
Num Epochs: 2
Num Topics; 150
PR-AUC: 0.6440
------------------
Current parameters:
Hidden Size: 100
Batch Size: 1
Learning Rate: 0.001
Num Epochs: 6
Num Topics; 50
PR-AUC: 0.6457
------------------
Current parameters:
Hidden Size: 100
Batch Size: 1
Learning Rate: 0.001
Num Epochs: 6
Num Topics; 150
PR-AUC: 0.6455
------------------
Current parameters:
Hidden Size: 100
Batch Size: 1
Learning Rate: 0.005
Num Epochs: 2
Num Topics; 50
PR-AUC: 0.6389
------------------
Current parameters:
Hidden Size: 100
Batch Size: 1
Learning Rate: 0.005
Num Epochs: 2
Num Topics; 150
PR-AUC: 0.6300
------------------
Current parameters:
Hidden Size: 100
Batch Size: 1
Learning Rate: 0.005
Num Epochs: 6
Num Topics; 50
PR-AUC: 0.6399
------------------
Current parameters:
Hidden Size: 100
Batch Size: 1
Learning Rate: 0.005
Num Epochs: 6
Num Topics; 150
PR-AUC: 0.6413
------------------
Current parameters:
Hidden Size: 100
Batch Size: 1
Learning Rate: 0.001
Num Epochs: 2
Num Topics; 50
PR-AUC: 0.6436
------------------
Current parameters:
Hidden Size: 100
Batch Size: 1
Learning Rate: 0.001
Num Epochs: 2
Num Topics; 150
PR-AUC: 0.6454
------------------
Current parameters:
Hidden Size: 100
Batch Size: 1
Learning Rate: 0.001
Num Epochs: 6
Num Topics; 50
PR-AUC: 0.6446
------------------
Current parameters:
Hidden Size: 100
Batch Size: 1
Learning Rate: 0.001
Num Epochs: 6
Num Topics; 150
PR-AUC: 0.6416
------------------
Current parameters:
Hidden Size: 100
Batch Size: 1
Learning Rate: 0.005
Num Epochs: 2
Num Topics; 50
PR-AUC: 0.6423
------------------
Current parameters:
Hidden Size: 100
Batch Size: 1
Learning Rate: 0.005
Num Epochs: 2
Num Topics; 150
PR-AUC: 0.6429
------------------
Current parameters:
Hidden Size: 100
Batch Size: 1
Learning Rate: 0.005
Num Epochs: 6
Num Topics; 50
PR-AUC: 0.6428
------------------
Current parameters:
Hidden Size: 100
Batch Size: 1
Learning Rate: 0.005
Num Epochs: 6
Num Topics; 150
PR-AUC: 0.6392
------------------
Best validation PR-AUC: 0.6454028876871399
Best hyperparameters: {'hidden_size': 100, 'batch_size': 1, 'learning_rate': 0.001, 'num_epochs': 2, 'n_topics': 150}
Update Config to Optimized Hyperparameters for Final Training + Testing
This code block updates the Config so that the paper parameters are replaced by the optimized parameters we found using the Grid Search function above. Since this is an extension to the original replication, we are commenting it out. As a result, the model will be trained and evaluated using the original paper’s parameters.
# ---- If you want to use Original Paper Parameters, DO NOT RUN THIS!! ----
# This code block updates the config so that the paper parameters are replaced by
# the optimized parameters found using the Grid Search function above.
# (embed_size is not part of the grid search, so best_config has no 'embed_size'
# entry and config.embed_size is left unchanged.)
#config.hidden_size = best_config['hidden_size']
#config.batch_size = best_config['batch_size']
#config.learning_rate = best_config['learning_rate']
#config.num_epochs = best_config['num_epochs']
#config.n_topics = best_config['n_topics']

# (Re)create your Training and Testing DataLoaders with the updated batch size if necessary
# train_loader = create_dataloader(X_train, Y_train, config, shuffle=True)
# test_loader = create_dataloader(X_test, Y_test, config, shuffle=False)
For reference, here are the optimal parameters we found using our Grid Search function.
Optimal hyperparams we found for CONTENT:
hidden_size = 100
batch_size = 1
learning_rate = 0.001
num_epochs = 2
n_topics = 150
Optimal hyperparams we found for GRU:
hidden_size = 100
batch_size = 1
learning_rate = 0.005
num_epochs = 2
n_topics = 50
Train and Test Model
Defining Model Initialization
CONTENT
def init_content():
    """Create a fresh CONTENT model and its Adam optimizer.

    Reads the notebook-level ``config`` for the device and learning rate
    (the hardcoded or grid-search hyperparameters, whichever are set).

    Returns:
        ``(model, optimizer)``.
    """
    model = ContentModel(config).to(config.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    return model, optimizer
def train_content(content_model, content_optimizer):
    """Train the CONTENT model for ``config.num_epochs`` epochs.

    After every epoch the collected per-batch arrays returned by ``train``
    are concatenated and saved to
    ``<config.content_theta_dir>/content_thetas_train_<epoch>.npy``.

    Returns:
        list with the average training loss of each epoch.
    """
    # np.save does not create missing directories.
    os.makedirs(config.content_theta_dir, exist_ok=True)

    content_model_losses = []
    print("Training CONTENT model:")
    for epoch in range(config.num_epochs):
        st = time.time()
        train_loss, train_theta_collector = train(
            content_model, train_loader, content_optimizer, config
        )
        content_model_losses.append(train_loss)

        # Save theta outputs for the epoch.
        train_thetas_arr = np.concatenate(train_theta_collector, axis=0)
        np.save(
            os.path.join(config.content_theta_dir, f"content_thetas_train_{epoch}.npy"),
            train_thetas_arr,
        )

        elapsed = time.time() - st
        print(f"\nEpoch {epoch+1}/{config.num_epochs} took {elapsed:.2f}s")
        print(f" [Train] loss={train_loss:.4f}")
    return content_model_losses
GRU
def train_gru(gru_model, gru_optimizer):
    """Train the GRU model for ``config.num_epochs`` epochs.

    After every epoch the collected per-batch hidden states returned by
    ``train`` are concatenated and saved to
    ``<config.gru_hiddens_dir>/gru_hiddens_train_<epoch>.npy``.

    Returns:
        list with the average training loss of each epoch.
    """
    # np.save does not create missing directories.
    os.makedirs(config.gru_hiddens_dir, exist_ok=True)

    gru_model_losses = []
    print("Training GRU model:")
    for epoch in range(config.num_epochs):
        st = time.time()
        train_loss, train_hidden_collector = train(
            gru_model, train_loader, gru_optimizer, config
        )
        gru_model_losses.append(train_loss)

        # Save hidden outputs for the epoch.
        train_hidden_arr = np.concatenate(train_hidden_collector, axis=0)
        np.save(
            os.path.join(config.gru_hiddens_dir, f"gru_hiddens_train_{epoch}.npy"),
            train_hidden_arr,
        )

        elapsed = time.time() - st
        print(f"\nEpoch {epoch+1}/{config.num_epochs} took {elapsed:.2f}s")
        print(f" [Train] loss={train_loss:.4f}")
    return gru_model_losses
Defining Plots
CONTENT
def plot_content_loss(content_model_losses):
    """Plot the CONTENT model's training loss against the epoch number.

    Args:
        content_model_losses: one average loss value per trained epoch.
    """
    # Derive the x-axis from the data so the plot still works when the list
    # length differs from the current config.num_epochs.
    epochs = range(1, len(content_model_losses) + 1)

    plt.figure(figsize=(10, 6))
    plt.plot(epochs, content_model_losses, label='CONTENT Model', marker='o')
    plt.xlabel('Epoch')
    plt.ylabel('Training Loss')
    plt.title('Training Loss over Epochs')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()
GRU
def plot_gru_loss(gru_model_losses):
    """Plot the GRU model's training loss against the epoch number.

    Args:
        gru_model_losses: one average loss value per trained epoch.
    """
    # Derive the x-axis from the data so the plot still works when the list
    # length differs from the current config.num_epochs.
    epochs = range(1, len(gru_model_losses) + 1)

    plt.figure(figsize=(10, 6))
    plt.plot(epochs, gru_model_losses, label='GRU Model', marker='o')
    plt.xlabel('Epoch')
    plt.ylabel('Training Loss')
    plt.title('Training Loss over Epochs')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()
Defining Testing
CONTENT
def test_content(content_model):
    """Evaluate the CONTENT model on the test loader, save outputs, print metrics.

    Saves the concatenated per-batch arrays to ``config.content_theta_dir``
    and the flat labels / predictions to ``config.content_results_dir``.

    Returns:
        ``(auc, prauc, acc, precision, recall, f1)``.
    """
    test_loss, test_true, test_pred, theta_collector = evaluate_model(
        content_model, test_loader, config
    )
    metrics = compute_metrics(test_true, test_pred)

    # np.save does not create missing directories.
    os.makedirs(config.content_theta_dir, exist_ok=True)
    os.makedirs(config.content_results_dir, exist_ok=True)

    # Save the CONTENT test results.
    np.save(
        os.path.join(config.content_theta_dir, "content_thetas_test_final.npy"),
        np.concatenate(theta_collector, axis=0),
    )
    np.save(
        os.path.join(config.content_results_dir, "content_test_labels_final.npy"),
        np.array(test_true),
    )
    np.save(
        os.path.join(config.content_results_dir, "content_test_preds_final.npy"),
        np.array(test_pred),
    )

    print(f"\n[CONTENT Test] loss={test_loss:.4f}, AUC={metrics['auc']:.4f}, "
          f"PR-AUC={metrics['prauc']:.4f}, ACC={metrics['acc']:.4f}, "
          f"Precision={metrics['precision']:.4f}, Recall={metrics['recall']:.4f}, "
          f"F1={metrics['f1']:.4f}")
    return (
        metrics['auc'],
        metrics['prauc'],
        metrics['acc'],
        metrics['precision'],
        metrics['recall'],
        metrics['f1'],
    )
GRU
def test_gru(gru_model):
    """Evaluate the GRU model on the test loader, save outputs, print metrics.

    Saves the concatenated per-batch hidden states to
    ``config.gru_hiddens_dir`` and the flat labels / predictions to
    ``config.gru_results_dir``.

    Returns:
        ``(auc, prauc, acc, precision, recall, f1)``.
    """
    test_loss, test_true, test_pred, hidden_collector = evaluate_model(
        gru_model, test_loader, config
    )
    metrics = compute_metrics(test_true, test_pred)

    # np.save does not create missing directories.
    os.makedirs(config.gru_hiddens_dir, exist_ok=True)
    os.makedirs(config.gru_results_dir, exist_ok=True)

    # Save the GRU test results.
    np.save(
        os.path.join(config.gru_hiddens_dir, "gru_hiddens_test_final.npy"),
        np.concatenate(hidden_collector, axis=0),
    )
    np.save(
        os.path.join(config.gru_results_dir, "gru_test_labels_final.npy"),
        np.array(test_true),
    )
    np.save(
        os.path.join(config.gru_results_dir, "gru_test_preds_final.npy"),
        np.array(test_pred),
    )

    print(f"\n[GRU Test] loss={test_loss:.4f}, AUC={metrics['auc']:.4f}, "
          f"PR-AUC={metrics['prauc']:.4f}, ACC={metrics['acc']:.4f}, "
          f"Precision={metrics['precision']:.4f}, Recall={metrics['recall']:.4f}, "
          f"F1={metrics['f1']:.4f}")
    return (
        metrics['auc'],
        metrics['prauc'],
        metrics['acc'],
        metrics['precision'],
        metrics['recall'],
        metrics['f1'],
    )
Executing Models
CONTENT
# Optional, confirm which params you are using
active_params = (
    ("Embed Size", config.embed_size),
    ("Hidden Size", config.hidden_size),
    ("Batch Size", config.batch_size),
    ("Learning Rate", config.learning_rate),
    ("Num Topics", config.n_topics),
    ("Num Epochs", config.num_epochs),
    ("Num Trials", config.num_trials),
)
# One "<name>: <value>" line per setting, in the order listed above.
for param_name, param_value in active_params:
    print(f"{param_name}: {param_value}")
Embed Size: 100
Hidden Size: 200
Batch Size: 1
Learning Rate: 0.001
Num Topics: 50
Num Epochs: 6
Num Trials: 10
def run_content(trial, seed=None):
    """Train and test one freshly initialised CONTENT model.

    Args:
        trial: trial number, used only in the printed banner.
        seed: when not None, seeds torch and numpy before the model is built.

    Returns:
        the metric tuple returned by ``test_content``.
    """
    print(f"-----TRIAL {trial}:-----")

    # Use new seed
    if seed is not None:
        torch.manual_seed(seed)
        np.random.seed(seed)

    # Brand-new model & optimiser
    content_model, content_optimizer = init_content()

    # Optional, but part of original CONTENT: word2vec embedding initialization.
    # NOTE(review): the optimizer is created before this call; if word2vec()
    # replaces parameter tensors rather than filling them in place, the
    # optimizer would not see the new ones — confirm against word2vec.
    content_model = word2vec(content_model)

    # Train it
    content_model_losses = train_content(content_model, content_optimizer)

    # Optional: plot training loss
    # plot_content_loss(content_model_losses)

    # Evaluate and collect metrics
    metrics = test_content(content_model)
    print("------------------\n")
    return metrics


def _mean_pm_std(values):
    """Format ``values`` as '<mean> +/- <stdev>' rounded to 4 decimals.

    ``statistics.stdev`` needs at least two values, so a single trial reports
    a spread of 0.0 instead of raising.
    """
    spread = stdev(values) if len(values) > 1 else 0.0
    return f"{round(mean(values), 4)} +/- {round(spread, 4)}"


# Repeat
content_metrics = [run_content(i + 1, seed=i) for i in range(config.num_trials)]

# Transpose the per-trial tuples into one list per metric.
content_aucs, content_praucs, content_accs, \
    content_precisions, content_recalls, content_f1s = map(list, zip(*content_metrics))

print("\n[CONTENT RESULTS OVER TRIALS] "
      f"AUC = {_mean_pm_std(content_aucs)}, "
      f"PRAUC = {_mean_pm_std(content_praucs)}, "
      f"ACC = {_mean_pm_std(content_accs)}, "
      f"PRECISION = {_mean_pm_std(content_precisions)}, "
      f"RECALL = {_mean_pm_std(content_recalls)}, "
      f"F1 = {_mean_pm_std(content_f1s)}")
-----TRIAL 1:-----
Training CONTENT model:
Epoch 1/6 took 23.83s
[Train] loss=0.4170
Epoch 2/6 took 30.39s
[Train] loss=0.4002
Epoch 3/6 took 25.83s
[Train] loss=0.3916
Epoch 4/6 took 28.47s
[Train] loss=0.3801
Epoch 5/6 took 26.46s
[Train] loss=0.3663
Epoch 6/6 took 27.75s
[Train] loss=0.3507
[CONTENT Test] loss=0.3996, AUC=0.7932, PR-AUC=0.6387, ACC=0.8353, Precision=0.7318, Recall=0.4169, F1=0.5312
------------------
-----TRIAL 2:-----
Training CONTENT model:
Epoch 1/6 took 23.10s
[Train] loss=0.4167
Epoch 2/6 took 26.40s
[Train] loss=0.4010
Epoch 3/6 took 22.44s
[Train] loss=0.3908
Epoch 4/6 took 21.56s
[Train] loss=0.3812
Epoch 5/6 took 27.95s
[Train] loss=0.3669
Epoch 6/6 took 25.63s
[Train] loss=0.3515
[CONTENT Test] loss=0.4081, AUC=0.7911, PR-AUC=0.6350, ACC=0.8344, Precision=0.7915, Recall=0.3534, F1=0.4886
------------------
-----TRIAL 3:-----
Training CONTENT model:
Epoch 1/6 took 31.39s
[Train] loss=0.4162
Epoch 2/6 took 28.93s
[Train] loss=0.4004
Epoch 3/6 took 26.25s
[Train] loss=0.3916
Epoch 4/6 took 22.70s
[Train] loss=0.3804
Epoch 5/6 took 21.61s
[Train] loss=0.3668
Epoch 6/6 took 22.42s
[Train] loss=0.3515
[CONTENT Test] loss=0.4061, AUC=0.7912, PR-AUC=0.6356, ACC=0.8316, Precision=0.6924, Recall=0.4455, F1=0.5421
------------------
-----TRIAL 4:-----
Training CONTENT model:
Epoch 1/6 took 22.17s
[Train] loss=0.4166
Epoch 2/6 took 25.83s
[Train] loss=0.4010
Epoch 3/6 took 22.90s
[Train] loss=0.3916
Epoch 4/6 took 21.58s
[Train] loss=0.3819
Epoch 5/6 took 22.48s
[Train] loss=0.3675
Epoch 6/6 took 22.46s
[Train] loss=0.3525
[CONTENT Test] loss=0.4100, AUC=0.7932, PR-AUC=0.6377, ACC=0.8269, Precision=0.6504, Recall=0.4894, F1=0.5585
------------------
-----TRIAL 5:-----
Training CONTENT model:
Epoch 1/6 took 22.08s
[Train] loss=0.4170
Epoch 2/6 took 21.50s
[Train] loss=0.4011
Epoch 3/6 took 22.23s
[Train] loss=0.3913
Epoch 4/6 took 22.36s
[Train] loss=0.3815
Epoch 5/6 took 22.21s
[Train] loss=0.3679
Epoch 6/6 took 21.83s
[Train] loss=0.3529
[CONTENT Test] loss=0.4073, AUC=0.7944, PR-AUC=0.6411, ACC=0.8297, Precision=0.6688, Recall=0.4735, F1=0.5544
------------------
-----TRIAL 6:-----
Training CONTENT model:
Epoch 1/6 took 21.87s
[Train] loss=0.4166
Epoch 2/6 took 22.28s
[Train] loss=0.4003
Epoch 3/6 took 22.25s
[Train] loss=0.3916
Epoch 4/6 took 21.77s
[Train] loss=0.3804
Epoch 5/6 took 22.29s
[Train] loss=0.3672
Epoch 6/6 took 22.40s
[Train] loss=0.3518
[CONTENT Test] loss=0.4027, AUC=0.7925, PR-AUC=0.6369, ACC=0.8348, Precision=0.7276, Recall=0.4183, F1=0.5312
------------------
-----TRIAL 7:-----
Training CONTENT model:
Epoch 1/6 took 22.19s
[Train] loss=0.4181
Epoch 2/6 took 21.43s
[Train] loss=0.4000
Epoch 3/6 took 22.28s
[Train] loss=0.3910
Epoch 4/6 took 22.37s
[Train] loss=0.3807
Epoch 5/6 took 21.79s
[Train] loss=0.3676
Epoch 6/6 took 22.21s
[Train] loss=0.3504
[CONTENT Test] loss=0.4051, AUC=0.7937, PR-AUC=0.6378, ACC=0.8317, Precision=0.6849, Recall=0.4595, F1=0.5500
------------------
-----TRIAL 8:-----
Training CONTENT model:
Epoch 1/6 took 22.10s
[Train] loss=0.4171
Epoch 2/6 took 22.26s
[Train] loss=0.4001
Epoch 3/6 took 21.78s
[Train] loss=0.3914
Epoch 4/6 took 22.00s
[Train] loss=0.3799
Epoch 5/6 took 22.38s
[Train] loss=0.3659
Epoch 6/6 took 22.61s
[Train] loss=0.3496
[CONTENT Test] loss=0.4092, AUC=0.7902, PR-AUC=0.6312, ACC=0.8313, Precision=0.7051, Recall=0.4233, F1=0.5290
------------------
-----TRIAL 9:-----
Training CONTENT model:
Epoch 1/6 took 21.84s
[Train] loss=0.4165
Epoch 2/6 took 21.82s
[Train] loss=0.4001
Epoch 3/6 took 22.27s
[Train] loss=0.3918
Epoch 4/6 took 22.41s
[Train] loss=0.3821
Epoch 5/6 took 21.64s
[Train] loss=0.3695
Epoch 6/6 took 22.35s
[Train] loss=0.3547
[CONTENT Test] loss=0.4087, AUC=0.7935, PR-AUC=0.6367, ACC=0.8309, Precision=0.6759, Recall=0.4693, F1=0.5540
------------------
-----TRIAL 10:-----
Training CONTENT model:
Epoch 1/6 took 22.08s
[Train] loss=0.4174
Epoch 2/6 took 22.25s
[Train] loss=0.4001
Epoch 3/6 took 21.49s
[Train] loss=0.3922
Epoch 4/6 took 22.24s
[Train] loss=0.3803
Epoch 5/6 took 22.45s
[Train] loss=0.3676
Epoch 6/6 took 22.21s
[Train] loss=0.3530
[CONTENT Test] loss=0.3995, AUC=0.7961, PR-AUC=0.6377, ACC=0.8348, Precision=0.7292, Recall=0.4163, F1=0.5300
------------------
[CONTENT RESULTS OVER TRIALS] AUC = 0.7929 +/- 0.0017, PRAUC = 0.6368 +/- 0.0026, ACC = 0.8321 +/- 0.0027, PRECISION = 0.7058 +/- 0.0408, RECALL = 0.4365 +/- 0.0395, F1 = 0.5369 +/- 0.0205