1. Setup and Data Loading
Note: The original dataset is proprietary, so I generate a synthetic dataset that reflects its structure for demonstration.
import pandas as pd
import numpy as np
# set random seed
np.random.seed(42)
# set dataset size
num_rows = 100
# categorical data
industry_options = ['Software', 'IT Services', 'Government (US)', 'Government (International)', 'Financial Services', 'Insurance', 'Telecommunication Services', 'Professional Services', 'Energy']
country_options = ['United States', 'United Kingdom', 'India', 'Canada', 'Australia', 'Germany']
source_options = ['Direct Traffic', 'Offline Sources', 'Email Marketing', 'Organic Search', 'Referrals']
relationship_options = ['Active Customer', 'Former Customer', 'Active Customer; Active Integration', 'Active OEM Partner', 'Former Customer; Active OEM Partner']
company_name_options = ['Company A', 'Company B', 'Company C', 'Company D', 'Company E']
# generate data for columns
data_dict = {
'Record ID': np.random.randint(10**6, 10**7, size=num_rows),
'Company name': np.random.choice(company_name_options, num_rows),
'Industry': np.random.choice(industry_options, num_rows),
'Country/Region': np.random.choice(country_options, num_rows),
'Relationship Type': np.random.choice(relationship_options, num_rows),
'Latest Source': np.random.choice(source_options, num_rows)
}
# generate random dates between 2019 and 2023
start_date = pd.to_datetime('2019-01-01')
end_date_range = pd.to_datetime('2023-07-14')
random_days = np.random.randint(0, (end_date_range - start_date).days, size=num_rows)
commence_dates = start_date + pd.to_timedelta(random_days, unit='d')
# engagement ends 1 to 5 years after commence
end_dates = commence_dates + pd.to_timedelta(np.random.randint(365, 365 * 5, size=num_rows), unit='d')
# signup date can be before/after commence or missing
signup_deltas = pd.to_timedelta(np.random.randint(-365, 365, size=num_rows), unit='d')
signup_dates = commence_dates + signup_deltas
# convert to numpy array to insert NaT values
signup_dates_array = signup_dates.to_numpy()
# make ~15% of signup dates missing
missing_mask = np.random.choice([True, False], size=num_rows, p=[0.15, 0.85])
signup_dates_array[missing_mask] = np.datetime64('NaT')
data_dict['Engagement Commence'] = commence_dates
data_dict['Engagement End'] = end_dates
data_dict['First GreyNoise Account Signup'] = signup_dates_array
# generate ARR as a string with commas
arr_values = np.random.randint(5000, 250000, num_rows)
data_dict['ARR'] = [f"{x:,}" for x in arr_values]
# create final dataframe
data = pd.DataFrame(data_dict)
# convert date columns to string in 'YYYY-MM-DD' format to match original raw data
for col in ['Engagement Commence', 'Engagement End', 'First GreyNoise Account Signup']:
    data[col] = pd.to_datetime(data[col]).dt.strftime('%Y-%m-%d').replace({pd.NaT: None})
2. Exploratory Data Analysis
2.1 Initial Inspection
data.head()
|   | Record ID | Company name | Industry | Country/Region | Relationship Type | Latest Source | Engagement Commence | Engagement End | First GreyNoise Account Signup | ARR |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7423388 | Company C | Telecommunication Services | United Kingdom | Active Customer; Active Integration | Direct Traffic | 2021-03-25 | 2024-05-03 | None | 108,677 |
| 1 | 7550634 | Company A | Government (International) | India | Active OEM Partner | Direct Traffic | 2022-03-18 | 2026-01-27 | 2021-03-22 | 72,779 |
| 2 | 5304572 | Company B | Energy | Germany | Former Customer | Direct Traffic | 2019-03-21 | 2022-12-11 | 2019-06-29 | 24,029 |
| 3 | 3234489 | Company B | Software | United Kingdom | Active Customer; Active Integration | Organic Search | 2021-06-04 | 2024-04-10 | 2022-01-28 | 64,201 |
| 4 | 8204212 | Company D | Professional Services | Germany | Former Customer; Active OEM Partner | Direct Traffic | 2022-05-21 | 2025-05-03 | 2023-04-02 | 37,265 |
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Record ID 100 non-null int64
1 Company name 100 non-null object
2 Industry 100 non-null object
3 Country/Region 100 non-null object
4 Relationship Type 100 non-null object
5 Latest Source 100 non-null object
6 Engagement Commence 100 non-null object
7 Engagement End 100 non-null object
8 First GreyNoise Account Signup 88 non-null object
9 ARR 100 non-null object
dtypes: int64(1), object(9)
memory usage: 7.9+ KB
Note: In the original dataset, all columns were parsed as strings. We’ll need to convert them to their appropriate data types later on.
data.isna().sum()
|   | 0 |
|---|---|
| Record ID | 0 |
| Company name | 0 |
| Industry | 0 |
| Country/Region | 0 |
| Relationship Type | 0 |
| Latest Source | 0 |
| Engagement Commence | 0 |
| Engagement End | 0 |
| First GreyNoise Account Signup | 12 |
| ARR | 0 |
There are 12 entries with missing values for a ‘First GreyNoise Account Signup’ date.
3. Data Preprocessing and Feature Engineering
3.1 Initial Cleaning and Type Conversion
I drop the columns ‘Record ID’ and ‘Company name’, which contain bookkeeping information and are not useful for modeling.
data = data.drop(columns=['Record ID', 'Company name'])
Since all variables were parsed as strings, I convert them to:
- Date: ‘Engagement Commence’, ‘Engagement End’, ‘First GreyNoise Account Signup’
- Numeric: ‘ARR’
date_columns = ['Engagement Commence', 'Engagement End', 'First GreyNoise Account Signup']
numeric_columns = ['ARR']
# convert columns to datetime
for col in date_columns:
    data[col] = pd.to_datetime(data[col], errors='coerce')
# remove commas from 'ARR'
data['ARR'] = data['ARR'].str.replace(',', '')
# convert columns to numeric
for col in numeric_columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')
# verify changes
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Industry 100 non-null object
1 Country/Region 100 non-null object
2 Relationship Type 100 non-null object
3 Latest Source 100 non-null object
4 Engagement Commence 100 non-null datetime64[ns]
5 Engagement End 100 non-null datetime64[ns]
6 First GreyNoise Account Signup 88 non-null datetime64[ns]
7 ARR 100 non-null int64
dtypes: datetime64[ns](3), int64(1), object(4)
memory usage: 6.4+ KB
3.2 Filtering
Contracts with commence dates before September 2019 were incomplete in the original dataset, so I drop them.
data = data[data['Engagement Commence'] >= '2019-09-01']
I only include companies with engagement commencement on or before July 14, 2022, ensuring each had at least one full year to potentially churn by the analysis date (July 14, 2023).
data = data[data['Engagement Commence'] <= '2022-07-14']
3.3 Feature Creation
I create the variable ‘Had Prior Account’ to indicate whether a customer signed up for a free GreyNoise account before the start of their paid engagement. This variable has three possible values:
- 1: the customer signed up for a free account before their engagement
- 0: the customer signed up on or after the engagement commenced
- -1: no signup date was recorded
conditions = [
    data['First GreyNoise Account Signup'] < data['Engagement Commence'],   # signed up before commence
    data['First GreyNoise Account Signup'] >= data['Engagement Commence'],  # signed up on or after commence
    data['First GreyNoise Account Signup'].isnull()                         # signup date missing
]
outcomes = [1, 0, -1]
# create column based on conditions
data['Had Prior Account'] = np.select(conditions, outcomes, default=-1)
‘Relationship Type’ contains various customer statuses. Any value that includes ‘Former Customer’ indicates a churned customer. I map these values to churn = 1, and all other values to churn = 0.
data['Churned'] = data['Relationship Type'].str.contains('Former Customer', na=False).astype(int)
3.4 Handling Missing Values
First, I drop the columns that are no longer needed for modeling, now that ‘Churned’ and ‘Had Prior Account’ have been derived from them.
data = data.drop(columns=['Relationship Type', 'Engagement Commence', 'Engagement End', 'First GreyNoise Account Signup'])
data.head()
|   | Industry | Country/Region | Latest Source | ARR | Had Prior Account | Churned |
|---|---|---|---|---|---|---|
| 0 | Telecommunication Services | United Kingdom | Direct Traffic | 108677 | -1 | 0 |
| 1 | Government (International) | India | Direct Traffic | 72779 | 1 | 0 |
| 3 | Software | United Kingdom | Organic Search | 64201 | 0 | 0 |
| 4 | Professional Services | Germany | Direct Traffic | 37265 | 0 | 1 |
| 6 | IT Services | India | Email Marketing | 248322 | 0 | 1 |
I check again for missing values across the dataset:
data.isna().sum()
I remove any rows with missing values. Given their low count, this should have a minimal effect on the overall dataset.
data.dropna(inplace=True)
4. Model Training and Validation
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
# 1. define features and target
X = data.drop('Churned', axis=1)
y = data['Churned']
# 2. define preprocessor
# define column types
categorical_features = ['Industry', 'Country/Region', 'Latest Source', 'Had Prior Account']
numeric_features = ['ARR']
# create preprocessor with ColumnTransformer
preprocessor = ColumnTransformer(
transformers=[
('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
('num', StandardScaler(), numeric_features)
])
# create full pipeline
pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', LogisticRegression(max_iter=1000))
])
# 3. estimate performance with stratified k-fold cv
# I use n_splits = 5
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# metrics to calculate
scoring_metrics = ['accuracy', 'roc_auc', 'average_precision']
# cross_validate with the StratifiedKFold object
cv_results = cross_validate(pipeline, X, y, cv=skf, scoring=scoring_metrics)
# print estimates
print("--- Model Performance Estimate (Stratified 5-Fold CV) ---")
print(f"Mean Accuracy: {np.mean(cv_results['test_accuracy']):.2f}")
print(f"Mean ROC AUC: {np.mean(cv_results['test_roc_auc']):.2f}")
# 4. train final model on ALL data
final_model = pipeline.fit(X, y)
--- Model Performance Estimate (Stratified 5-Fold CV) ---
Mean Accuracy: 0.51
Mean ROC AUC: 0.57
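For context (this check is not part of the original notebook), it can help to compare these numbers against the churn base rate in the modeling set, since an accuracy close to the majority-class share suggests the features carry limited signal. The average precision scores were also computed above but not printed:
# not in the original notebook: naive reference points for the CV scores
print(f"Churn rate in modeling set: {y.mean():.2f}")
print(f"Mean Average Precision: {np.mean(cv_results['test_average_precision']):.2f}")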
5. Model Interpretation
# get trained classifier from pipeline
classifier = pipeline.named_steps['classifier']
# get preprocessor and transformed feature names
preprocessor = pipeline.named_steps['preprocessor']
feature_names = preprocessor.get_feature_names_out()
# create df
importance_df = pd.DataFrame({
'Feature': feature_names,
'Coefficient': classifier.coef_[0]
})
# sort by the absolute value to see most impactful features
importance_df['Abs_Coefficient'] = importance_df['Coefficient'].abs()
importance_df = importance_df.sort_values(by='Abs_Coefficient', ascending=False)
importance_df
|   | Feature | Coefficient | Abs_Coefficient |
|---|---|---|---|
| 1 | cat__Industry_Financial Services | 1.425638 | 1.425638 |
| 19 | cat__Latest Source_Referrals | -1.014383 | 1.014383 |
| 10 | cat__Country/Region_Canada | 0.899791 | 0.899791 |
| 18 | cat__Latest Source_Organic Search | 0.627315 | 0.627315 |
| 3 | cat__Industry_Government (US) | -0.580932 | 0.580932 |
| 21 | cat__Had Prior Account_0 | 0.574032 | 0.574032 |
| 15 | cat__Latest Source_Direct Traffic | 0.562287 | 0.562287 |
| 22 | cat__Had Prior Account_1 | -0.550713 | 0.550713 |
| 2 | cat__Industry_Government (International) | -0.526404 | 0.526404 |
| 13 | cat__Country/Region_United Kingdom | -0.517882 | 0.517882 |
| 6 | cat__Industry_Professional Services | -0.508398 | 0.508398 |
| 12 | cat__Country/Region_India | 0.414341 | 0.414341 |
| 11 | cat__Country/Region_Germany | -0.408978 | 0.408978 |
| 5 | cat__Industry_Insurance | 0.316264 | 0.316264 |
| 7 | cat__Industry_Software | -0.261435 | 0.261435 |
| 9 | cat__Country/Region_Australia | -0.236989 | 0.236989 |
| 16 | cat__Latest Source_Email Marketing | -0.221798 | 0.221798 |
| 14 | cat__Country/Region_United States | -0.145851 | 0.145851 |
| 23 | num__ARR | 0.108085 | 0.108085 |
| 8 | cat__Industry_Telecommunication Services | 0.107558 | 0.107558 |
| 17 | cat__Latest Source_Offline Sources | 0.051013 | 0.051013 |
| 0 | cat__Industry_Energy | 0.031259 | 0.031259 |
| 20 | cat__Had Prior Account_-1 | -0.018886 | 0.018886 |
| 4 | cat__Industry_IT Services | 0.000883 | 0.000883 |
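Since the classifier is a logistic regression, these coefficients are on the log-odds scale. As an optional extra (not part of the original analysis), exponentiating them gives odds ratios, which some readers find easier to interpret:
# not in the original notebook: express coefficients as odds ratios
importance_df['Odds Ratio'] = np.exp(importance_df['Coefficient'])
print(importance_df[['Feature', 'Odds Ratio']].head(10))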
6. Final Model Export
import joblib
joblib.dump(final_model, 'gn_churn_model.joblib')
['gn_churn_model.joblib']
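As a quick sanity check (not part of the original notebook), the exported pipeline can be reloaded and used to score a new record. The row below is purely illustrative; its column names must match the training features:
# illustrative only: reload the saved pipeline and score a made-up record
loaded_model = joblib.load('gn_churn_model.joblib')
new_record = pd.DataFrame([{
    'Industry': 'Software',
    'Country/Region': 'United States',
    'Latest Source': 'Organic Search',
    'ARR': 50000,
    'Had Prior Account': 1
}])
print(loaded_model.predict_proba(new_record)[:, 1])  # estimated churn probability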