Config and Imports

Mount Google Drive

# View and modify the working path
# NOTE: `google.colab` only exists inside a Colab runtime; this cell fails
# elsewhere. Mounting prompts for Drive authorization on first run.
import os
from google.colab import drive

# View current working directory (Colab starts in /content)
print("Current Working *Directory*:", os.getcwd())

# Mount Google Drive
drive.mount('/content/gdrive')

# Change working directory to your file position so the relative
# "../resource" / "../output" paths in Config resolve correctly.
path = "/content/gdrive/My Drive/BD4H-project-main/code"
os.chdir(path)

# Confirm the change
print("Working Directory:", os.getcwd())
Current Working *Directory*: /content
Mounted at /content/gdrive
Working Directory: /content/gdrive/My Drive/BD4H-project/code

Initialize Paper (Xiao et al., 2018) Parameters

TAs - for the purpose of faster runtime, we are intentionally setting the parameters “num_epochs” to 2 and “num_trials” to 2, even though Xiao et al. use 6 and 10, respectively. To replicate the original paper, the evaluation outputs you see at the end (see the “Executing Models” section) were produced with the original 6-epoch / 10-trial parameters.

import os
import pickle
import time
import numpy as np
import pandas as pd
from statistics import mean, stdev

import torch
import torch.nn as nn

import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import (
    precision_recall_fscore_support,
    roc_auc_score,
    accuracy_score,
    precision_recall_curve,
)
from sklearn.metrics import average_precision_score as pr_auc
from sklearn.cluster import MiniBatchKMeans
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
import zipfile

class Config:
    """
    Holds hyperparameters, file paths, and general settings.
    In practice, you could store these in a YAML/JSON file.

    NOTE: all attributes are evaluated once, at class-definition time;
    the device probe (and its print) runs when this class body executes.
    """
    # Data paths (relative to the ../code working directory set above)
    dataset_dir = "../resource"
    zipped_file = os.path.join(dataset_dir, "S1_Data.zip")
    input_file  = os.path.join(dataset_dir, "S1_Data.txt")  # after unzipping

    # Vocabulary artifacts written by build_vocab() / load_vocab_dict()
    vocab_file = os.path.join(dataset_dir, "vocab.txt")
    stop_file = os.path.join(dataset_dir, "stop.txt")
    vocab_pkl = os.path.join(dataset_dir, "vocab.pkl")

    # PKLs for train, valid, test data
    pkl_train_x = os.path.join(dataset_dir, "X_train.pkl")
    pkl_train_y = os.path.join(dataset_dir, "Y_train.pkl")
    pkl_val_x   = os.path.join(dataset_dir, "X_valid.pkl")
    pkl_val_y   = os.path.join(dataset_dir, "Y_valid.pkl")
    pkl_test_x  = os.path.join(dataset_dir, "X_test.pkl")
    pkl_test_y  = os.path.join(dataset_dir, "Y_test.pkl")

    # For building the vocab: keep tokens with count > rare_word_threshold;
    # tokens with count > stop_word_threshold are also listed as stop words.
    rare_word_threshold = 100
    stop_word_threshold = 1e4

    # Index assigned to out-of-vocabulary codes in convert_format()
    unknown_index = 1
    vocab_size = 490
    n_stops = 12  # last 12 are considered "stop words"
    n_topics = 50
    max_visit_len = 300  # visits per patient are truncated/padded to this

    # Model hyperparams
    embed_size = 100
    hidden_size = 200

    # Training settings (batch_size 1 = one patient per step)
    batch_size = 1
    grad_clip = 100
    learning_rate = 0.001
    num_epochs = 2
    num_trials = 2

    # Where to save intermediate outputs
    content_theta_dir = "../output/CONTENT_theta"
    gru_hiddens_dir = "../output/GRU_hiddens"
    content_results_dir = "../output/CONTENT_results"
    gru_results_dir = "../output/GRU_results"

    # Prefer GPU when available; prints once at class-definition time.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[Config] Device: {device}")
[Config] Device: cuda

Execute Config setup

config = Config()

# Create every output directory up front so later np.save calls cannot fail.
for _out_dir in (config.content_theta_dir, config.gru_hiddens_dir,
                 config.content_results_dir, config.gru_results_dir):
    os.makedirs(_out_dir, exist_ok=True)

Data Setup

Utility Helpers (Pickle, Numpy, Vocab, etc.)

def save_pkl(path, obj):
    """Serialize *obj* to *path* with pickle and log the write."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)
    print(f"Saved: {path}")

def load_pkl(path):
    """Deserialize and return the pickled object at *path*, logging the read."""
    with open(path, "rb") as handle:
        payload = pickle.load(handle)
    print(f"Loaded: {path}")
    return payload

def save_npy(path, arr):
    """Write *arr* to *path* in .npy format and log the write."""
    np.save(path, arr)
    print(f"Saved: {path}")

def load_npy(path):
    """Load and return the .npy array at *path* (pickled objects allowed)."""
    data = np.load(path, allow_pickle=True)
    print(f"Loaded: {path}")
    return data

def ensure_data_unzipped(config: Config):
    """
    Make sure the raw data file exists on disk.

    If ``config.input_file`` is already present, nothing is done; otherwise
    ``config.zipped_file`` is extracted into ``config.dataset_dir``.
    """
    if not os.path.exists(config.input_file):
        with zipfile.ZipFile(config.zipped_file, 'r') as archive:
            archive.extractall(config.dataset_dir)
        print(f"Unzipped {config.zipped_file} => {config.dataset_dir}")
        return
    print(f"S1_Data.txt already unzipped at: {config.input_file}")

def build_vocab(config: Config):
    """
    Derive vocab.txt and stop.txt from the raw claims file.

    Token counts are taken over DX_GROUP_DESCRIPTION. Tokens occurring more
    than ``rare_word_threshold`` times are kept and written in ascending
    frequency order with indices starting at 2 (index 1 is reserved for the
    unknown token). Tokens above ``stop_word_threshold`` also go to stop.txt.
    """
    raw = pd.read_csv(config.input_file, sep="\t", header=0)
    counts = (raw.groupby("DX_GROUP_DESCRIPTION")
                 .size()
                 .reset_index(name="SIZE"))

    # Drop rare tokens, then order least-frequent first.
    counts = counts[counts["SIZE"] > config.rare_word_threshold]
    counts = counts.sort_values(by="SIZE").reset_index(drop=True)

    tokens = counts["DX_GROUP_DESCRIPTION"]
    tokens.index += 2  # shift so that index 1 stays free for "unknown"
    tokens.to_csv(config.vocab_file, sep="\t", header=False, index=True)
    print("Number of valid tokens:", len(tokens))

    # Extremely frequent tokens double as stop words.
    frequent = counts[counts["SIZE"] > config.stop_word_threshold]
    frequent["DX_GROUP_DESCRIPTION"].to_csv(config.stop_file, sep="\t", header=False, index=False)

def load_vocab_dict(config: Config):
    """
    Parse vocab_file into {token: index} (the file's index minus one) and
    pickle the inverse {index: token} mapping for later inspection.

    NOTE(review): vocab.txt indices start at 2, so after the -1 shift the
    smallest vocab index is 1, which coincides with Config.unknown_index —
    confirm this collision is intended.
    """
    word_to_index = {}
    with open(config.vocab_file, "r") as fh:
        for row in fh:
            idx_str, token = row.strip().split("\t")
            word_to_index[token] = int(idx_str) - 1

    save_pkl(config.vocab_pkl, {idx: tok for tok, idx in word_to_index.items()})
    print(f"Vocab size: {len(word_to_index)}")
    return word_to_index

def loadEmbeddingMatrix(wordvecFile, word_to_index, vocab_size):
    """
    Read a text word2vec file ("<count> <dim>" header, then one
    "<word> <float> ... <float>" line per word) into a [vocab_size, dim]
    float32 matrix. Row (index - 1) holds the vector of each in-vocabulary
    word; rows for words missing from the file stay zero.
    """
    with open(wordvecFile, "r") as fh:
        # Header: total word count (unused) and embedding dimensionality.
        header_fields = fh.readline().strip().split()
        dim = int(header_fields[1])
        W = np.zeros((vocab_size, dim), dtype=np.float32)
        for row in fh:
            fields = row.strip().split()
            # The word may itself contain spaces: everything before the
            # trailing `dim` floats belongs to the word.
            token = " ".join(fields[:-dim])
            if token not in word_to_index:
                print(f"{token} is not in vocabulary; skipping.")
                continue
            # Vocab indices are 1-based; matrix rows are 0-based.
            W[word_to_index[token] - 1] = np.array(fields[-dim:], dtype=np.float32)
    return W

Data Preprocessing and Splitting

def extract_inpatient_events(config: Config):
    """
    Return inpatient-stay counts per (patient, day), indexed by PID.

    Downstream, a visit is labeled 1 (readmission) when the patient has an
    'INPATIENT HOSPITAL' event within 30 days of that visit.
    """
    claims = pd.read_csv(config.input_file, sep="\t", header=0)
    inpatient = claims[claims["SERVICE_LOCATION"] == "INPATIENT HOSPITAL"]
    per_day = inpatient.groupby(["PID", "DAY_ID", "SERVICE_LOCATION"]).size()
    per_day = per_day.reset_index(name="COUNT")
    per_day = per_day.sort_values(["PID", "DAY_ID"], ascending=True)
    return per_day.set_index("PID")

def convert_format(config: Config, word2idx, inpat_df):
    """
    Stream S1_Data.txt line by line => builds docs & labels.

    Assumes rows are grouped by PID and, within a patient, contiguous per
    DAY_ID (a patient/day boundary is detected by a change in those values).

    Returns:
        docs[i]   = [ [codes_on_day1], [codes_on_day2], ... ]
        labels[i] = [0/1 for each day]  (1 = inpatient event within 30 days)
    """
    def is_readmitted(pid, day):
        # True when the patient has an inpatient stay in [day, day + 30).
        # Note the current day itself is included in the window.
        try:
            recs = inpat_df.loc[int(pid)]
            if isinstance(recs, pd.Series):
                # Single inpatient record => .loc returned a Series.
                return (int(day) <= recs.DAY_ID < int(day) + 30)
            # Else a sub-DataFrame
            subset = recs.loc[(int(day) <= recs.DAY_ID) & (recs.DAY_ID < int(day) + 30)]
            return subset.shape[0] > 0
        except KeyError:
            # Patient has no inpatient events at all.
            return False

    docs, labels = [], []
    with open(config.input_file, "r") as f:
        # Map column names to positions from the header row.
        header = f.readline().strip().split("\t")
        col_idx = {h: i for i, h in enumerate(header)}

        doc, visit_codes, label_seq = [], [], []
        line = f.readline()
        if not line:
            # Header-only file => nothing to build.
            return docs, labels

        # Prime the loop with the first data row's patient/day.
        tokens = line.strip().split("\t")
        pid, day_id = tokens[col_idx["PID"]], tokens[col_idx["DAY_ID"]]
        label_seq.append(1 if is_readmitted(pid, day_id) else 0)

        while line:
            tokens = line.strip().split("\t")
            c_pid, c_day = tokens[col_idx["PID"]], tokens[col_idx["DAY_ID"]]

            if c_pid != pid:
                # Patient boundary: close out the previous patient's last
                # visit and flush the finished (doc, label) pair.
                doc.append(visit_codes)
                docs.append(doc)
                labels.append(label_seq)
                # Reset
                doc, visit_codes, label_seq = [], [], []
                pid, day_id = c_pid, c_day
                label_seq.append(1 if is_readmitted(pid, day_id) else 0)
            else:
                # Same patient, check if new day
                if c_day != day_id:
                    doc.append(visit_codes)
                    visit_codes = []
                    day_id = c_day
                    label_seq.append(1 if is_readmitted(pid, day_id) else 0)

            # Map the diagnosis string to its vocab index (OOV => unknown_index).
            diag_str = tokens[col_idx["DX_GROUP_DESCRIPTION"]]
            diag_idx = word2idx.get(diag_str, config.unknown_index)
            visit_codes.append(diag_idx)
            line = f.readline()

        # Finalize the last open visit/patient.
        doc.append(visit_codes)
        docs.append(doc)
        labels.append(label_seq)

    return docs, labels

def split_and_save(config: Config, docs, labels):
    """
    Split docs/labels into train/valid/test partitions and pickle each.
    The fixed boundaries (first 2000 / next 500 / rest) match the authors'.
    """
    train_end, val_end = 2000, 2500

    partitions = [
        (config.pkl_train_x, docs[:train_end]),
        (config.pkl_train_y, labels[:train_end]),
        (config.pkl_val_x, docs[train_end:val_end]),
        (config.pkl_val_y, labels[train_end:val_end]),
        (config.pkl_test_x, docs[val_end:]),
        (config.pkl_test_y, labels[val_end:]),
    ]
    for target_path, payload in partitions:
        save_pkl(target_path, payload)

PyTorch Dataset & DataLoader

class PatientVisitsDataset(Dataset):
    """
    A PyTorch Dataset holding (docs, labels) for one split (train/valid/test).

    docs[i]   : list of visits; each visit is a list of 1-based code indices.
    labels[i] : per-visit 0/1 readmission labels.

    Multi-hot conversion and padding happen in the collate_fn; vocab_size
    and max_len are stored here so the dataset is self-describing.
    """
    def __init__(self, docs, labels, vocab_size, max_len):
        super().__init__()
        self.docs = docs
        self.labels = labels
        self.vocab_size = vocab_size
        # Fix: max_len was previously accepted but silently dropped.
        self.max_len = max_len

    def __len__(self):
        return len(self.docs)

    def __getitem__(self, idx):
        # Return the raw (doc, label) pair; tensorization is deferred
        # to the collate function.
        return self.docs[idx], self.labels[idx]

def multi_hot_collate_fn(batch, vocab_size, max_len):
    """
    Turn a list of (doc, label) pairs into padded tensors (x, y, mask).

    x    : [B, max_len, vocab_size] multi-hot visits (1-based code -> col-1)
    y    : [B, max_len] labels; padded positions stay at 1 (masked in loss)
    mask : [B, max_len] 1 for real visits, 0 for padding
    """
    n = len(batch)
    x = np.zeros((n, max_len, vocab_size), dtype=np.float32)
    y = np.ones((n, max_len), dtype=np.float32)
    mask = np.zeros((n, max_len), dtype=np.float32)

    for row, (doc, lab) in enumerate(batch):
        # Truncate overly long histories to max_len visits.
        keep = min(len(doc), max_len)
        mask[row, :keep] = 1
        y[row, :keep] = lab[:keep]
        for t, visit in enumerate(doc[:keep]):
            for code in visit:
                # Vocab codes are 1-indexed; columns are 0-indexed.
                x[row, t, code - 1] = 1

    return torch.from_numpy(x), torch.from_numpy(y), torch.from_numpy(mask)


def create_dataloader(docs, labels, config: Config, shuffle=False):
    """Wrap docs/labels in a PatientVisitsDataset and return its DataLoader."""
    collate = lambda items: multi_hot_collate_fn(items, config.vocab_size, config.max_visit_len)
    return DataLoader(
        PatientVisitsDataset(docs, labels, config.vocab_size, config.max_visit_len),
        batch_size=config.batch_size,
        shuffle=shuffle,
        collate_fn=collate,
    )

Executing Data Prep

If you have already run steps 1 and 2, you can comment them out — step 3 loads the existing pickled data.

# 1) Unzip the raw data if needed, then build and load the vocabulary.
ensure_data_unzipped(config)
build_vocab(config)
w2i = load_vocab_dict(config)

# 2) Extract inpatient events, convert rows to docs/labels, split, pickle.
inpat_events = extract_inpatient_events(config)
docs, labels = convert_format(config, w2i, inpat_events)
split_and_save(config, docs, labels)

# 3) Reload the pickled splits into memory.
X_train, Y_train = load_pkl(config.pkl_train_x), load_pkl(config.pkl_train_y)
X_valid, Y_valid = load_pkl(config.pkl_val_x), load_pkl(config.pkl_val_y)
X_test, Y_test = load_pkl(config.pkl_test_x), load_pkl(config.pkl_test_y)

# 4) Wrap each split in a DataLoader (only training data is shuffled).
train_loader = create_dataloader(X_train, Y_train, config, shuffle=True)
valid_loader = create_dataloader(X_valid, Y_valid, config, shuffle=False)
test_loader = create_dataloader(X_test, Y_test, config, shuffle=False)
S1_Data.txt already unzipped at: ../resource/S1_Data.txt
Number of valid tokens: 490
Saved: ../resource/vocab.pkl
Vocab size: 490
Saved: ../resource/X_train.pkl
Saved: ../resource/Y_train.pkl
Saved: ../resource/X_valid.pkl
Saved: ../resource/Y_valid.pkl
Saved: ../resource/X_test.pkl
Saved: ../resource/Y_test.pkl
Loaded: ../resource/X_train.pkl
Loaded: ../resource/Y_train.pkl
Loaded: ../resource/X_valid.pkl
Loaded: ../resource/Y_valid.pkl
Loaded: ../resource/X_test.pkl
Loaded: ../resource/Y_test.pkl

Executing Label Distribution

# Count visit-level labels over all patients (1 = 30-day readmission).
label_distribution = {1: 0, 0: 0}
for value in (v for patient_labels in labels for v in patient_labels):
    label_distribution[value] += 1
print("Label 1 has {} counts, and Label 0 has {} counts".format(label_distribution[1], label_distribution[0]))
Label 1 has 54427 counts, and Label 0 has 185509 counts

Defining CONTENT Model

class ThetaLayer(nn.Module):
    """
    Variational topic layer: reparameterized sampling of the patient-level
    topic distribution theta, plus the KL(q || N(0, I)) term.

    Side effects of forward(): stores the sampled ``theta`` ([B, K]), the
    scalar ``kl_term``, and ``batch_size_flat`` on the module.
    """
    def __init__(self, max_len, n_topics):
        super().__init__()
        self.max_len = max_len
        self.kl_term = 0.0   # refreshed on every forward pass
        self.theta = None    # last sampled topic distribution
        self.n_topics = n_topics

    def forward(self, mu, log_sigma):
        """
        mu, log_sigma : [B, n_topics] variational parameters.
        Returns theta broadcast over time: [B, max_len, n_topics].
        """
        # Reparameterization trick: z = mu + sigma * eps, eps ~ N(0, I).
        eps = torch.randn_like(mu)
        z = mu + torch.exp(0.5 * log_sigma) * eps
        theta = F.softmax(z, dim=1)
        self.theta = theta

        # KL divergence between q(z|x) and the standard normal prior.
        self.kl_term = -0.5 * torch.sum(1 + log_sigma - mu.pow(2) - log_sigma.exp())

        self.batch_size_flat = mu.size(0)

        # Broadcast theta across the (padded) time axis => [B, max_len, K].
        # Fix: removed an unreachable duplicate `return` that followed this one.
        expanded_theta = theta.unsqueeze(1).expand(-1, self.max_len, self.n_topics)
        return expanded_theta


class ContentModel(nn.Module):
    """
    CONTENT: recurrent network + neural topic model (Xiao et al., 2018).

      1) multi-hot visits => linear embed => GRU  (local, visit-level signal)
      2) visits => dense stack => mu/log_sigma => sampled theta
         (global, patient-level topic signal)
      3) B(x) weighted by theta => per-visit context score
      4) GRU score + context => sigmoid => per-visit readmission probability

    forward() returns (probs [B,T], h_n [1,B,H], theta [B,T,K]).
    """
    def __init__(self, config: Config):
        super().__init__()
        self.vocab_size = config.vocab_size
        self.embed_size = config.embed_size
        self.hidden_size= config.hidden_size
        self.n_topics   = config.n_topics
        self.max_len    = config.max_visit_len

        # Linear "embedding" of the multi-hot input (sum of code embeddings);
        # its weight is overwritten by word2vec() when warm-starting.
        self.embed = nn.Linear(self.vocab_size, self.embed_size, bias=False)
        self.gru = nn.GRU(self.embed_size, self.hidden_size, batch_first=True)

        # Recognition network producing the variational parameters.
        self.dense1 = nn.Linear(self.vocab_size, self.hidden_size)
        self.dense2 = nn.Linear(self.hidden_size, self.hidden_size)
        self.mu = nn.Linear(self.hidden_size, self.n_topics)
        self.log_sigma = nn.Linear(self.hidden_size, self.n_topics)

        # Topic weights used to score each visit against theta.
        self.B = nn.Linear(self.vocab_size, self.n_topics, bias=False)
        self.out_layer = nn.Linear(self.hidden_size, 1)

        self.theta_layer = ThetaLayer(self.max_len, self.n_topics)

    def forward(self, x, mask):
        # x: [B, T, vocab_size] multi-hot visits; mask: [B, T] 1 = real visit.
        B, T, _ = x.shape

        # x => [batch, seq_len, vocab_size]
        embedded = self.embed(x) * mask.unsqueeze(-1)  # [B, T, embed_size]
        gru_out, h_n = self.gru(embedded)

        # Topic recognition network.
        h1 = F.relu(self.dense1(x))  # [B, T, hidden]
        h2 = F.relu(self.dense2(h1)) # [B, T, hidden]

        # Mean of h2 over time, normalized by the number of real visits.
        # NOTE(review): the sum also covers padded steps (relu of the dense
        # biases is generally nonzero there) while the divisor counts only
        # real visits — confirm this matches the intended pooling.
        seq_lens = mask.sum(dim=1, keepdim=True)             # [B,1]
        h2_mean  = (h2.sum(dim=1) / (seq_lens + 1e-8))       # [B,H]

        mu        = self.mu(h2_mean)                         # [B,K]
        log_sigma = self.log_sigma(h2_mean)                     # [B,K]

        theta = self.theta_layer(mu, log_sigma)              # [B,T,K]

        # Context score: topic match of each visit, averaged over topics.
        context = (self.B(x) * theta).mean(dim=-1)           # mean over K  ➜ [B,T]
        rnn_scores = self.out_layer(gru_out).squeeze(-1)       # [B,T]

        logits = rnn_scores + context                        # additive fusion
        probs  = torch.sigmoid(logits)                       # [B,T]

        # Zero padded positions after the sigmoid; the epsilon keeps the
        # (masked-out) BCE terms finite at those positions.
        probs = probs * mask + 1e-6

        return probs, h_n, theta

    @property
    def kl_term(self):
      # KL from the most recent forward pass, scaled by max sequence length.
      return self.theta_layer.kl_term / (self.theta_layer.max_len + 1e-8)

Defining GRU Model

class GRUModel(nn.Module):
    """
    Plain GRU baseline (no embedding layer, no topic model).

    Consumes multi-hot visit vectors directly and emits one readmission
    probability per time step. Mirrors the ContentModel API: forward()
    returns (probs, h_n, None), and ``kl_term`` is a constant 0.0 so the
    shared train()/evaluate_model() code works unchanged.
    """
    def __init__(self, config: Config):
        super().__init__()
        self.vocab_size = config.vocab_size
        self.hidden_size = config.hidden_size

        # Multi-hot input feeds the recurrent layer directly.
        self.gru = nn.GRU(self.vocab_size, self.hidden_size, batch_first=True)

        # Per-step score head.
        self.out_layer = nn.Linear(self.hidden_size, 1)

        # API-compat stand-in for ContentModel's KL term.
        self.kl_term = 0.0

    def forward(self, x, mask):
        """
        x    : [B, T, vocab_size] multi-hot visits
        mask : [B, T] 1 for real visits, 0 for padding
        """
        # Zero out padded steps before the GRU sees them.
        gru_out, h_n = self.gru(x * mask.unsqueeze(-1))

        scores = self.out_layer(gru_out).squeeze(-1)        # [B, T]
        probs = torch.sigmoid(scores) * mask                # kill padded steps
        probs = torch.clamp(probs, min=1e-6, max=1 - 1e-6)  # keep BCE finite

        # Third element is None in place of theta, matching ContentModel.
        return probs, h_n, None

Defining Training Function

def train(model, loader, optimizer, config: Config):
    """
    One training epoch over 'loader'.

    Returns:
        avg_loss  : mean loss across batches.
        collector : per-batch feature arrays — [theta | final hidden] for
                    ContentModel, final hidden only for GRUModel.
    """
    model.train()
    total_loss = 0.
    batch_count= 0
    collector  = []

    for x_batch, y_batch, m_batch in loader:
        x_batch = x_batch.to(config.device)
        y_batch = y_batch.to(config.device)
        m_batch = m_batch.to(config.device)

        optimizer.zero_grad()

        # Handle different model types
        if isinstance(model, ContentModel):
            preds, h_n, theta = model(x_batch, m_batch)
            # Mask the BCE so padded visits do not contribute to the loss.
            bce = F.binary_cross_entropy(preds, y_batch, reduction='none')
            bce = (bce * m_batch).sum() / m_batch.sum()    # mean over real visits
            loss = bce + model.kl_term               # add KL term from model

            # Store [theta + hidden]
            rnn_vec  = h_n.squeeze(0).detach().cpu().numpy()     # [B, H]
            theta_np = theta[:, 0, :].detach().cpu().numpy() # [B, K]  ← collapse T
            combined = np.concatenate([theta_np, rnn_vec], axis=1)  # [B, K+H]
            collector.append(combined)

        elif isinstance(model, GRUModel):
            preds, h_n, _ = model(x_batch, m_batch)
            bce = F.binary_cross_entropy(preds, y_batch, reduction='none')
            bce = (bce * m_batch).sum() / m_batch.sum()    # mean over real visits
            loss = bce  # no KL term for the plain GRU

            # Store just the hidden for GRU
            rnn_vec = h_n.squeeze(0).detach().cpu().numpy()
            collector.append(rnn_vec)

        loss.backward()
        # Clip gradients to keep single-sample updates stable.
        torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)
        optimizer.step()

        total_loss += loss.item()
        batch_count+= 1

    avg_loss = total_loss / max(batch_count,1)
    return avg_loss, collector

Defining Evaluation Functions

def evaluate_model(model, loader, config: Config):
    """
    Evaluates on a DataLoader (e.g. test/valid) without gradient updates.

    Returns:
        avg_loss           : mean loss across batches.
        all_true, all_pred : flattened labels/probabilities over real
                             (unmasked) visits only.
        all_theta_hidden   : per-batch [theta | hidden] (ContentModel) or
                             hidden-only (GRUModel) feature arrays.
    """
    model.eval()
    total_loss = 0.0
    batch_count= 0
    all_true, all_pred = [], []
    all_theta_hidden = []

    with torch.no_grad():
        for x_batch, y_batch, mask_batch in loader:
            x_batch = x_batch.to(config.device)
            y_batch = y_batch.to(config.device)
            mask_batch = mask_batch.to(config.device)

            if isinstance(model, ContentModel):
                preds, h_n, theta = model(x_batch, mask_batch)
                # Masked BCE: only real visits contribute.
                bce = F.binary_cross_entropy(preds, y_batch, reduction='none')
                bce = (bce * mask_batch).sum() / mask_batch.sum()    # mean over real visits
                loss = bce + model.kl_term               # add KL term from model

                # Store [theta + hidden]
                rnn_vec  = h_n.squeeze(0).detach().cpu().numpy()     # [B, H]
                theta_np = theta[:, 0, :].detach().cpu().numpy() # [B, K]  ← collapse T
                combined = np.concatenate([theta_np, rnn_vec], axis=1)  # [B, K+H]
                all_theta_hidden.append(combined)

            elif isinstance(model, GRUModel):
                preds, h_n, _ = model(x_batch, mask_batch)
                bce = F.binary_cross_entropy(preds, y_batch, reduction='none')
                bce = (bce * mask_batch).sum() / mask_batch.sum()    # mean over real visits
                loss = bce

                # Store just the hidden for GRU
                rnn_vec = h_n.squeeze(0).detach().cpu().numpy()
                all_theta_hidden.append(rnn_vec)

            total_loss += loss.item()
            batch_count += 1

            # flatten predictions/labels ignoring masked positions
            seq_lens = mask_batch.sum(dim=1).cpu().numpy().astype(int)
            preds_np = preds.detach().cpu().numpy()
            y_np     = y_batch.detach().cpu().numpy()

            for i in range(x_batch.shape[0]):
                length_i = seq_lens[i]
                all_pred.extend(preds_np[i, :length_i])
                all_true.extend(y_np[i, :length_i])

    avg_loss = total_loss / (batch_count if batch_count else 1)
    return avg_loss, all_true, all_pred, all_theta_hidden


def compute_metrics(true_vals, pred_vals):
    """
    Compute AUC, PR-AUC, accuracy, precision, recall and F1.

    Probabilities are thresholded at 0.5 for the classification metrics.
    """
    binarized = (np.array(pred_vals) > 0.5).astype(int)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_vals, binarized, average="binary")
    return {
        "auc": roc_auc_score(true_vals, pred_vals),
        "prauc": pr_auc(true_vals, pred_vals),
        "acc": accuracy_score(true_vals, binarized),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

Train and Test Model

Defining Model Initialization

CONTENT

def init_content():
    """Build a fresh ContentModel plus Adam optimizer on the configured device."""
    model = ContentModel(config).to(config.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    return model, optimizer

GRU

def init_gru():
    """Build a fresh GRUModel plus Adam optimizer on the configured device."""
    model = GRUModel(config).to(config.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    return model, optimizer

Word2Vec Embedding (CONTENT model only)

# Load the pre-trained word2vec matrix used to warm-start the CONTENT
# embedding layer (CONTENT model only).
vocab_mapping = load_vocab_dict(config)
wordvecPath = os.path.join(config.dataset_dir, "word2vec.vector")
W_embed = loadEmbeddingMatrix(wordvecPath, vocab_mapping, config.vocab_size)

def word2vec(content_model):
    """Warm-start the embedding layer with the pre-trained word2vec matrix (in place)."""
    with torch.no_grad():
        content_model.embed.weight.copy_(torch.from_numpy(W_embed.T).to(config.device))
    return content_model
Saved: ../resource/vocab.pkl
Vocab size: 490

Defining Training

CONTENT

def train_content(content_model, content_optimizer):
    """Run config.num_epochs of CONTENT training; return the per-epoch losses."""
    losses = []
    print("Training CONTENT model:")
    for epoch in range(config.num_epochs):
        started = time.time()
        epoch_loss, theta_collector = train(content_model, train_loader, content_optimizer, config)
        losses.append(epoch_loss)

        # Persist the [theta | hidden] features gathered during this epoch.
        np.save(os.path.join(config.content_theta_dir, f"content_thetas_train_{epoch}.npy"),
                np.concatenate(theta_collector, axis=0))

        print(f"\nEpoch {epoch+1}/{config.num_epochs} took {time.time() - started:.2f}s")
        print(f"  [Train] loss={epoch_loss:.4f}")
    return losses

GRU

def train_gru(gru_model, gru_optimizer):
    """Run config.num_epochs of GRU-baseline training; return the per-epoch losses."""
    losses = []
    print("Training GRU model:")
    for epoch in range(config.num_epochs):
        started = time.time()
        epoch_loss, hidden_collector = train(gru_model, train_loader, gru_optimizer, config)
        losses.append(epoch_loss)

        # Persist the hidden-state features gathered during this epoch.
        np.save(os.path.join(config.gru_hiddens_dir, f"gru_hiddens_train_{epoch}.npy"),
                np.concatenate(hidden_collector, axis=0))

        print(f"\nEpoch {epoch+1}/{config.num_epochs} took {time.time() - started:.2f}s")
        print(f"  [Train] loss={epoch_loss:.4f}")
    return losses

Defining Plots

CONTENT

def plot_content_loss(content_model_losses):
    """Line plot of CONTENT training loss per epoch."""
    xs = range(1, config.num_epochs + 1)

    plt.figure(figsize=(10, 6))
    plt.plot(xs, content_model_losses, label='CONTENT Model', marker='o')
    plt.xlabel('Epoch')
    plt.ylabel('Training Loss')
    plt.title('Training Loss over Epochs')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

GRU

def plot_gru_loss(gru_model_losses):
    """Line plot of GRU-baseline training loss per epoch."""
    xs = range(1, config.num_epochs + 1)

    plt.figure(figsize=(10, 6))
    plt.plot(xs, gru_model_losses, label='GRU Model', marker='o')
    plt.xlabel('Epoch')
    plt.ylabel('Training Loss')
    plt.title('Training Loss over Epochs')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

Defining Testing

CONTENT

def test_content(content_model):
    """Evaluate CONTENT on the test split, save artifacts, return the metrics tuple."""
    loss, y_true, y_pred, theta_collector = evaluate_model(content_model, test_loader, config)
    metrics = compute_metrics(y_true, y_pred)

    # Persist features, labels and predictions for downstream analysis.
    np.save(os.path.join(config.content_theta_dir, "content_thetas_test_final.npy"),
            np.concatenate(theta_collector, axis=0))
    np.save(os.path.join(config.content_results_dir, "content_test_labels_final.npy"), np.array(y_true))
    np.save(os.path.join(config.content_results_dir, "content_test_preds_final.npy"), np.array(y_pred))

    print(f"\n[CONTENT Test] loss={loss:.4f}, AUC={metrics['auc']:.4f}, "
          f"PR-AUC={metrics['prauc']:.4f}, ACC={metrics['acc']:.4f}, "
          f"Precision={metrics['precision']:.4f}, Recall={metrics['recall']:.4f}, "
          f"F1={metrics['f1']:.4f}")

    return (metrics['auc'], metrics['prauc'], metrics['acc'],
            metrics['precision'], metrics['recall'], metrics['f1'])

GRU

def test_gru(gru_model):
    """Evaluate the GRU baseline on the test split, save artifacts, return the metrics tuple."""
    loss, y_true, y_pred, hidden_collector = evaluate_model(gru_model, test_loader, config)
    metrics = compute_metrics(y_true, y_pred)

    # Persist features, labels and predictions for downstream analysis.
    np.save(os.path.join(config.gru_hiddens_dir, "gru_hiddens_test_final.npy"),
            np.concatenate(hidden_collector, axis=0))
    np.save(os.path.join(config.gru_results_dir, "gru_test_labels_final.npy"), np.array(y_true))
    np.save(os.path.join(config.gru_results_dir, "gru_test_preds_final.npy"), np.array(y_pred))

    print(f"\n[GRU Test] loss={loss:.4f}, AUC={metrics['auc']:.4f}, "
          f"PR-AUC={metrics['prauc']:.4f}, ACC={metrics['acc']:.4f}, "
          f"Precision={metrics['precision']:.4f}, Recall={metrics['recall']:.4f}, "
          f"F1={metrics['f1']:.4f}")

    return (metrics['auc'], metrics['prauc'], metrics['acc'],
            metrics['precision'], metrics['recall'], metrics['f1'])

Executing Models

CONTENT

# Optional, confirm which params you are using
for _label, _value in [
    ("Embed Size", config.embed_size),
    ("Hidden Size", config.hidden_size),
    ("Batch Size", config.batch_size),
    ("Learning Rate", config.learning_rate),
    ("Num Topics", config.n_topics),
    ("Num Epochs", config.num_epochs),
    ("Num Trials", config.num_trials),
]:
    print(f"{_label}: {_value}")
Embed Size: 100
Hidden Size: 200
Batch Size: 1
Learning Rate: 0.001
Num Topics: 50
Num Epochs: 6
Num Trials: 10
def run_content(trial, seed=None):
  """Run one full CONTENT train/evaluate trial and return its test metrics."""
  print(f"-----TRIAL {trial}:-----")

  # Re-seed both RNGs so each trial is independently reproducible.
  if seed is not None:
      torch.manual_seed(seed)
      np.random.seed(seed)

  # Fresh model + optimiser; word2vec embedding init is part of original CONTENT.
  model, optimiser = init_content()
  model = word2vec(model)

  # Train; per-epoch losses are kept only for the optional diagnostic plot.
  losses = train_content(model, optimiser)
  # plot_content_loss(losses)  # optional: visualise training loss

  trial_metrics = test_content(model)
  print("------------------\n")
  return trial_metrics

# Run the configured number of independent CONTENT trials.
content_metrics = []
for i in range(config.num_trials):
    content_metrics.append(run_content(i + 1, seed=i))

# Transpose per-trial tuples into per-metric lists.
(content_aucs, content_praucs, content_accs,
 content_precisions, content_recalls, content_f1s) = (
    list(column) for column in zip(*content_metrics)
)

# Report mean +/- sample standard deviation for each metric.
_content_summary = ", ".join(
    f"{name} = {round(mean(values), 4)} +/- {round(stdev(values), 4)}"
    for name, values in (
        ("AUC", content_aucs),
        ("PRAUC", content_praucs),
        ("ACC", content_accs),
        ("PRECISION", content_precisions),
        ("RECALL", content_recalls),
        ("F1", content_f1s),
    )
)
print("\n[CONTENT RESULTS OVER TRIALS] " + _content_summary)
-----TRIAL 1:-----
Training CONTENT model:

Epoch 1/6 took 23.83s
  [Train] loss=0.4170

Epoch 2/6 took 30.39s
  [Train] loss=0.4002

Epoch 3/6 took 25.83s
  [Train] loss=0.3916

Epoch 4/6 took 28.47s
  [Train] loss=0.3801

Epoch 5/6 took 26.46s
  [Train] loss=0.3663

Epoch 6/6 took 27.75s
  [Train] loss=0.3507

[CONTENT Test] loss=0.3996, AUC=0.7932, PR-AUC=0.6387, ACC=0.8353, Precision=0.7318, Recall=0.4169, F1=0.5312
------------------

-----TRIAL 2:-----
Training CONTENT model:

Epoch 1/6 took 23.10s
  [Train] loss=0.4167

Epoch 2/6 took 26.40s
  [Train] loss=0.4010

Epoch 3/6 took 22.44s
  [Train] loss=0.3908

Epoch 4/6 took 21.56s
  [Train] loss=0.3812

Epoch 5/6 took 27.95s
  [Train] loss=0.3669

Epoch 6/6 took 25.63s
  [Train] loss=0.3515

[CONTENT Test] loss=0.4081, AUC=0.7911, PR-AUC=0.6350, ACC=0.8344, Precision=0.7915, Recall=0.3534, F1=0.4886
------------------

-----TRIAL 3:-----
Training CONTENT model:

Epoch 1/6 took 31.39s
  [Train] loss=0.4162

Epoch 2/6 took 28.93s
  [Train] loss=0.4004

Epoch 3/6 took 26.25s
  [Train] loss=0.3916

Epoch 4/6 took 22.70s
  [Train] loss=0.3804

Epoch 5/6 took 21.61s
  [Train] loss=0.3668

Epoch 6/6 took 22.42s
  [Train] loss=0.3515

[CONTENT Test] loss=0.4061, AUC=0.7912, PR-AUC=0.6356, ACC=0.8316, Precision=0.6924, Recall=0.4455, F1=0.5421
------------------

-----TRIAL 4:-----
Training CONTENT model:

Epoch 1/6 took 22.17s
  [Train] loss=0.4166

Epoch 2/6 took 25.83s
  [Train] loss=0.4010

Epoch 3/6 took 22.90s
  [Train] loss=0.3916

Epoch 4/6 took 21.58s
  [Train] loss=0.3819

Epoch 5/6 took 22.48s
  [Train] loss=0.3675

Epoch 6/6 took 22.46s
  [Train] loss=0.3525

[CONTENT Test] loss=0.4100, AUC=0.7932, PR-AUC=0.6377, ACC=0.8269, Precision=0.6504, Recall=0.4894, F1=0.5585
------------------

-----TRIAL 5:-----
Training CONTENT model:

Epoch 1/6 took 22.08s
  [Train] loss=0.4170

Epoch 2/6 took 21.50s
  [Train] loss=0.4011

Epoch 3/6 took 22.23s
  [Train] loss=0.3913

Epoch 4/6 took 22.36s
  [Train] loss=0.3815

Epoch 5/6 took 22.21s
  [Train] loss=0.3679

Epoch 6/6 took 21.83s
  [Train] loss=0.3529

[CONTENT Test] loss=0.4073, AUC=0.7944, PR-AUC=0.6411, ACC=0.8297, Precision=0.6688, Recall=0.4735, F1=0.5544
------------------

-----TRIAL 6:-----
Training CONTENT model:

Epoch 1/6 took 21.87s
  [Train] loss=0.4166

Epoch 2/6 took 22.28s
  [Train] loss=0.4003

Epoch 3/6 took 22.25s
  [Train] loss=0.3916

Epoch 4/6 took 21.77s
  [Train] loss=0.3804

Epoch 5/6 took 22.29s
  [Train] loss=0.3672

Epoch 6/6 took 22.40s
  [Train] loss=0.3518

[CONTENT Test] loss=0.4027, AUC=0.7925, PR-AUC=0.6369, ACC=0.8348, Precision=0.7276, Recall=0.4183, F1=0.5312
------------------

-----TRIAL 7:-----
Training CONTENT model:

Epoch 1/6 took 22.19s
  [Train] loss=0.4181

Epoch 2/6 took 21.43s
  [Train] loss=0.4000

Epoch 3/6 took 22.28s
  [Train] loss=0.3910

Epoch 4/6 took 22.37s
  [Train] loss=0.3807

Epoch 5/6 took 21.79s
  [Train] loss=0.3676

Epoch 6/6 took 22.21s
  [Train] loss=0.3504

[CONTENT Test] loss=0.4051, AUC=0.7937, PR-AUC=0.6378, ACC=0.8317, Precision=0.6849, Recall=0.4595, F1=0.5500
------------------

-----TRIAL 8:-----
Training CONTENT model:

Epoch 1/6 took 22.10s
  [Train] loss=0.4171

Epoch 2/6 took 22.26s
  [Train] loss=0.4001

Epoch 3/6 took 21.78s
  [Train] loss=0.3914

Epoch 4/6 took 22.00s
  [Train] loss=0.3799

Epoch 5/6 took 22.38s
  [Train] loss=0.3659

Epoch 6/6 took 22.61s
  [Train] loss=0.3496

[CONTENT Test] loss=0.4092, AUC=0.7902, PR-AUC=0.6312, ACC=0.8313, Precision=0.7051, Recall=0.4233, F1=0.5290
------------------

-----TRIAL 9:-----
Training CONTENT model:

Epoch 1/6 took 21.84s
  [Train] loss=0.4165

Epoch 2/6 took 21.82s
  [Train] loss=0.4001

Epoch 3/6 took 22.27s
  [Train] loss=0.3918

Epoch 4/6 took 22.41s
  [Train] loss=0.3821

Epoch 5/6 took 21.64s
  [Train] loss=0.3695

Epoch 6/6 took 22.35s
  [Train] loss=0.3547

[CONTENT Test] loss=0.4087, AUC=0.7935, PR-AUC=0.6367, ACC=0.8309, Precision=0.6759, Recall=0.4693, F1=0.5540
------------------

-----TRIAL 10:-----
Training CONTENT model:

Epoch 1/6 took 22.08s
  [Train] loss=0.4174

Epoch 2/6 took 22.25s
  [Train] loss=0.4001

Epoch 3/6 took 21.49s
  [Train] loss=0.3922

Epoch 4/6 took 22.24s
  [Train] loss=0.3803

Epoch 5/6 took 22.45s
  [Train] loss=0.3676

Epoch 6/6 took 22.21s
  [Train] loss=0.3530

[CONTENT Test] loss=0.3995, AUC=0.7961, PR-AUC=0.6377, ACC=0.8348, Precision=0.7292, Recall=0.4163, F1=0.5300
------------------


[CONTENT RESULTS OVER TRIALS] AUC = 0.7929 +/- 0.0017, PRAUC = 0.6368 +/- 0.0026, ACC = 0.8321 +/- 0.0027, PRECISION = 0.7058 +/- 0.0408, RECALL = 0.4365 +/- 0.0395, F1 = 0.5369 +/- 0.0205

GRU

def run_gru(trial, seed=None):
  """Run one full GRU train/evaluate trial and return its test metrics."""
  print(f"-----TRIAL {trial}:-----")

  # Re-seed both RNGs so each trial is independently reproducible.
  if seed is not None:
      torch.manual_seed(seed)
      np.random.seed(seed)

  # Fresh model + optimiser for every trial.
  model, optimiser = init_gru()

  # Train; per-epoch losses are kept only for the optional diagnostic plot.
  losses = train_gru(model, optimiser)
  # plot_gru_loss(losses)  # optional: visualise training loss

  trial_metrics = test_gru(model)
  print("------------------\n")
  return trial_metrics

# Run the configured number of independent GRU trials.
gru_metrics = []
for i in range(config.num_trials):
    gru_metrics.append(run_gru(i + 1, seed=i))

# Transpose per-trial tuples into per-metric lists.
(gru_aucs, gru_praucs, gru_accs,
 gru_precisions, gru_recalls, gru_f1s) = (
    list(column) for column in zip(*gru_metrics)
)

# Report mean +/- sample standard deviation for each metric.
_gru_summary = ", ".join(
    f"{name} = {round(mean(values), 4)} +/- {round(stdev(values), 4)}"
    for name, values in (
        ("AUC", gru_aucs),
        ("PRAUC", gru_praucs),
        ("ACC", gru_accs),
        ("PRECISION", gru_precisions),
        ("RECALL", gru_recalls),
        ("F1", gru_f1s),
    )
)
print("\n[GRU RESULTS OVER TRIALS] " + _gru_summary)
-----TRIAL 1:-----
Training GRU model:

Epoch 1/6 took 18.50s
  [Train] loss=0.4286

Epoch 2/6 took 17.94s
  [Train] loss=0.4017

Epoch 3/6 took 18.56s
  [Train] loss=0.3956

Epoch 4/6 took 17.78s
  [Train] loss=0.3872

Epoch 5/6 took 18.11s
  [Train] loss=0.3772

Epoch 6/6 took 18.35s
  [Train] loss=0.3647

[GRU Test] loss=0.4045, AUC=0.7915, PR-AUC=0.6337, ACC=0.8332, Precision=0.7243, Recall=0.4116, F1=0.5249
------------------

-----TRIAL 2:-----
Training GRU model:

Epoch 1/6 took 17.76s
  [Train] loss=0.4273

Epoch 2/6 took 17.58s
  [Train] loss=0.4019

Epoch 3/6 took 17.37s
  [Train] loss=0.3953

Epoch 4/6 took 18.07s
  [Train] loss=0.3875

Epoch 5/6 took 17.46s
  [Train] loss=0.3776

Epoch 6/6 took 17.37s
  [Train] loss=0.3656

[GRU Test] loss=0.4041, AUC=0.7891, PR-AUC=0.6341, ACC=0.8336, Precision=0.7127, Recall=0.4296, F1=0.5361
------------------

-----TRIAL 3:-----
Training GRU model:

Epoch 1/6 took 17.71s
  [Train] loss=0.4285

Epoch 2/6 took 17.35s
  [Train] loss=0.4023

Epoch 3/6 took 18.06s
  [Train] loss=0.3951

Epoch 4/6 took 17.43s
  [Train] loss=0.3869

Epoch 5/6 took 17.46s
  [Train] loss=0.3764

Epoch 6/6 took 18.15s
  [Train] loss=0.3641

[GRU Test] loss=0.4093, AUC=0.7873, PR-AUC=0.6255, ACC=0.8291, Precision=0.6888, Recall=0.4310, F1=0.5302
------------------

-----TRIAL 4:-----
Training GRU model:

Epoch 1/6 took 17.27s
  [Train] loss=0.4280

Epoch 2/6 took 17.96s
  [Train] loss=0.4021

Epoch 3/6 took 17.42s
  [Train] loss=0.3963

Epoch 4/6 took 17.30s
  [Train] loss=0.3881

Epoch 5/6 took 18.08s
  [Train] loss=0.3773

Epoch 6/6 took 17.31s
  [Train] loss=0.3662

[GRU Test] loss=0.4101, AUC=0.7841, PR-AUC=0.6240, ACC=0.8303, Precision=0.7027, Recall=0.4195, F1=0.5253
------------------

-----TRIAL 5:-----
Training GRU model:

Epoch 1/6 took 17.77s
  [Train] loss=0.4272

Epoch 2/6 took 17.58s
  [Train] loss=0.4021

Epoch 3/6 took 17.36s
  [Train] loss=0.3970

Epoch 4/6 took 18.10s
  [Train] loss=0.3880

Epoch 5/6 took 17.37s
  [Train] loss=0.3780

Epoch 6/6 took 17.28s
  [Train] loss=0.3673

[GRU Test] loss=0.4095, AUC=0.7888, PR-AUC=0.6329, ACC=0.8299, Precision=0.6752, Recall=0.4623, F1=0.5488
------------------

-----TRIAL 6:-----
Training GRU model:

Epoch 1/6 took 17.66s
  [Train] loss=0.4274

Epoch 2/6 took 17.28s
  [Train] loss=0.4012

Epoch 3/6 took 18.05s
  [Train] loss=0.3953

Epoch 4/6 took 17.47s
  [Train] loss=0.3869

Epoch 5/6 took 17.35s
  [Train] loss=0.3760

Epoch 6/6 took 18.17s
  [Train] loss=0.3645

[GRU Test] loss=0.4073, AUC=0.7859, PR-AUC=0.6259, ACC=0.8302, Precision=0.6967, Recall=0.4271, F1=0.5296
------------------

-----TRIAL 7:-----
Training GRU model:

Epoch 1/6 took 17.28s
  [Train] loss=0.4259

Epoch 2/6 took 17.97s
  [Train] loss=0.4026

Epoch 3/6 took 17.52s
  [Train] loss=0.3961

Epoch 4/6 took 17.30s
  [Train] loss=0.3885

Epoch 5/6 took 18.16s
  [Train] loss=0.3802

Epoch 6/6 took 17.33s
  [Train] loss=0.3691

[GRU Test] loss=0.4006, AUC=0.7898, PR-AUC=0.6341, ACC=0.8345, Precision=0.7479, Recall=0.3931, F1=0.5153
------------------

-----TRIAL 8:-----
Training GRU model:

Epoch 1/6 took 17.68s
  [Train] loss=0.4286

Epoch 2/6 took 17.59s
  [Train] loss=0.4027

Epoch 3/6 took 17.36s
  [Train] loss=0.3962

Epoch 4/6 took 18.10s
  [Train] loss=0.3868

Epoch 5/6 took 17.40s
  [Train] loss=0.3768

Epoch 6/6 took 17.31s
  [Train] loss=0.3641

[GRU Test] loss=0.4044, AUC=0.7878, PR-AUC=0.6287, ACC=0.8325, Precision=0.7207, Recall=0.4106, F1=0.5231
------------------

-----TRIAL 9:-----
Training GRU model:

Epoch 1/6 took 17.70s
  [Train] loss=0.4271

Epoch 2/6 took 17.23s
  [Train] loss=0.4023

Epoch 3/6 took 18.28s
  [Train] loss=0.3950

Epoch 4/6 took 17.43s
  [Train] loss=0.3871

Epoch 5/6 took 17.51s
  [Train] loss=0.3765

Epoch 6/6 took 18.33s
  [Train] loss=0.3635

[GRU Test] loss=0.4108, AUC=0.7880, PR-AUC=0.6289, ACC=0.8266, Precision=0.6635, Recall=0.4565, F1=0.5409
------------------

-----TRIAL 10:-----
Training GRU model:

Epoch 1/6 took 17.29s
  [Train] loss=0.4293

Epoch 2/6 took 18.06s
  [Train] loss=0.4019

Epoch 3/6 took 17.98s
  [Train] loss=0.3958

Epoch 4/6 took 17.50s
  [Train] loss=0.3885

Epoch 5/6 took 18.45s
  [Train] loss=0.3780

Epoch 6/6 took 17.46s
  [Train] loss=0.3665

[GRU Test] loss=0.4035, AUC=0.7868, PR-AUC=0.6270, ACC=0.8330, Precision=0.7459, Recall=0.3850, F1=0.5079
------------------


[GRU RESULTS OVER TRIALS] AUC = 0.7879 +/- 0.0021, PRAUC = 0.6295 +/- 0.0039, ACC = 0.8313 +/- 0.0025, PRECISION = 0.7079 +/- 0.028, RECALL = 0.4226 +/- 0.0246, F1 = 0.5282 +/- 0.0119