1. Setup and Data Loading
Note: The original dataset is proprietary, so I generate a synthetic dataset that reflects its structure for demonstration.
import pandas as pd
import numpy as np

# set random seed
np.random.seed(42)

# set dataset size
num_rows = 100

# categorical data
industry_options = ['Software', 'IT Services', 'Government (US)', 'Government (International)', 'Financial Services', 'Insurance', 'Telecommunication Services', 'Professional Services', 'Energy']
country_options = ['United States', 'United Kingdom', 'India', 'Canada', 'Australia', 'Germany']
source_options = ['Direct Traffic', 'Offline Sources', 'Email Marketing', 'Organic Search', 'Referrals']
relationship_options = ['Active Customer', 'Former Customer', 'Active Customer; Active Integration', 'Active OEM Partner', 'Former Customer; Active OEM Partner']
company_name_options = ['Company A', 'Company B', 'Company C', 'Company D', 'Company E']

# generate data for columns
data_dict = {
    'Record ID': np.random.randint(10**6, 10**7, size=num_rows),
    'Company name': np.random.choice(company_name_options, num_rows),
    'Industry': np.random.choice(industry_options, num_rows),
    'Country/Region': np.random.choice(country_options, num_rows),
    'Relationship Type': np.random.choice(relationship_options, num_rows),
    'Latest Source': np.random.choice(source_options, num_rows)
}

# generate random dates between 2019 and 2023
start_date = pd.to_datetime('2019-01-01')
end_date_range = pd.to_datetime('2023-07-14')
random_days = np.random.randint(0, (end_date_range - start_date).days, size=num_rows)
commence_dates = start_date + pd.to_timedelta(random_days, unit='d')

# engagement ends 1 to 5 years after commence
end_dates = commence_dates + pd.to_timedelta(np.random.randint(365, 365 * 5, size=num_rows), unit='d')

# signup date can be before/after commence or missing
signup_deltas = pd.to_timedelta(np.random.randint(-365, 365, size=num_rows), unit='d')
signup_dates = commence_dates + signup_deltas

# convert to numpy array to insert NaT values
signup_dates_array = signup_dates.to_numpy()

# make ~15% of signup dates missing
missing_mask = np.random.choice([True, False], size=num_rows, p=[0.15, 0.85])
signup_dates_array[missing_mask] = np.datetime64('NaT')

data_dict['Engagement Commence'] = commence_dates
data_dict['Engagement End'] = end_dates
data_dict['First GreyNoise Account Signup'] = signup_dates_array

# generate ARR as a string with commas
arr_values = np.random.randint(5000, 250000, num_rows)
data_dict['ARR'] = [f"{x:,}" for x in arr_values]

# create final dataframe
data = pd.DataFrame(data_dict)

# convert date columns to string in 'YYYY-MM-DD' format to match the original raw data
for col in ['Engagement Commence', 'Engagement End', 'First GreyNoise Account Signup']:
    data[col] = pd.to_datetime(data[col]).dt.strftime('%Y-%m-%d').replace({pd.NaT: None})
2. Exploratory Data Analysis
2.1 Initial Inspection
data.head()
 | Record ID | Company name | Industry | Country/Region | Relationship Type | Latest Source | Engagement Commence | Engagement End | First GreyNoise Account Signup | ARR |
---|---|---|---|---|---|---|---|---|---|---|
0 | 7423388 | Company C | Telecommunication Services | United Kingdom | Active Customer; Active Integration | Direct Traffic | 2021-03-25 | 2024-05-03 | None | 108,677 |
1 | 7550634 | Company A | Government (International) | India | Active OEM Partner | Direct Traffic | 2022-03-18 | 2026-01-27 | 2021-03-22 | 72,779 |
2 | 5304572 | Company B | Energy | Germany | Former Customer | Direct Traffic | 2019-03-21 | 2022-12-11 | 2019-06-29 | 24,029 |
3 | 3234489 | Company B | Software | United Kingdom | Active Customer; Active Integration | Organic Search | 2021-06-04 | 2024-04-10 | 2022-01-28 | 64,201 |
4 | 8204212 | Company D | Professional Services | Germany | Former Customer; Active OEM Partner | Direct Traffic | 2022-05-21 | 2025-05-03 | 2023-04-02 | 37,265 |
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Record ID 100 non-null int64
1 Company name 100 non-null object
2 Industry 100 non-null object
3 Country/Region 100 non-null object
4 Relationship Type 100 non-null object
5 Latest Source 100 non-null object
6 Engagement Commence 100 non-null object
7 Engagement End 100 non-null object
8 First GreyNoise Account Signup 88 non-null object
9 ARR 100 non-null object
dtypes: int64(1), object(9)
memory usage: 7.9+ KB
Note: In the original dataset, all columns were parsed as strings. We’ll need to convert them to their appropriate data types later on.
data.isna().sum()
 | 0 |
---|---|
Record ID | 0 |
Company name | 0 |
Industry | 0 |
Country/Region | 0 |
Relationship Type | 0 |
Latest Source | 0 |
Engagement Commence | 0 |
Engagement End | 0 |
First GreyNoise Account Signup | 12 |
ARR | 0 |
There are 12 entries missing a ‘First GreyNoise Account Signup’ date.
3. Data Preprocessing and Feature Engineering
3.1 Initial Cleaning and Type Conversion
I drop the columns ‘Record ID’ and ‘Company name’, which contain bookkeeping information and are not useful for modeling.
data = data.drop(columns=['Record ID', 'Company name'])
Since all variables were parsed as strings, I convert them to:
- Date: ‘Engagement Commence’, ‘Engagement End’, ‘First GreyNoise Account Signup’
- Numeric: ‘ARR’
date_columns = ['Engagement Commence', 'Engagement End', 'First GreyNoise Account Signup']
numeric_columns = ['ARR']

# convert columns to datetime
for col in date_columns:
    data[col] = pd.to_datetime(data[col], errors='coerce')

# remove commas from 'ARR'
data['ARR'] = data['ARR'].str.replace(',', '')

# convert columns to numeric
for col in numeric_columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')
# verify changes
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Industry 100 non-null object
1 Country/Region 100 non-null object
2 Relationship Type 100 non-null object
3 Latest Source 100 non-null object
4 Engagement Commence 100 non-null datetime64[ns]
5 Engagement End 100 non-null datetime64[ns]
6 First GreyNoise Account Signup 88 non-null datetime64[ns]
7 ARR 100 non-null int64
dtypes: datetime64[ns](3), int64(1), object(4)
memory usage: 6.4+ KB
3.2 Filtering
Contracts with commence dates before September 2019 were incomplete in the original dataset, so I drop them.
data = data[data['Engagement Commence'] >= '2019-09-01']
I only include companies with engagement commencement on or before July 14, 2022, ensuring each had at least one full year to potentially churn by the analysis date (July 14, 2023).
data = data[data['Engagement Commence'] <= '2022-07-14']
3.3 Feature Creation
I create the variable ‘Had Prior Account’ to indicate whether a customer signed up for a free GreyNoise account before the start of their paid engagement. This variable has three possible values:
- 1: the customer signed up for a free account before their engagement
- 0: the customer signed up after the engagement commenced
- -1: no signup date was recorded
conditions = [
    data['First GreyNoise Account Signup'] < data['Engagement Commence'],   # account before commence
    data['First GreyNoise Account Signup'] >= data['Engagement Commence'],  # account after commence
    data['First GreyNoise Account Signup'].isnull()                         # no signup date recorded
]
outcomes = [1, 0, -1]

# create column based on conditions
data['Had Prior Account'] = np.select(conditions, outcomes, default=-1)
‘Relationship Type’ contains various customer statuses. Any value that includes ‘Former Customer’ indicates a churned customer. I map these values to churn = 1, and all other values to churn = 0.
data['Churned'] = data['Relationship Type'].str.contains('Former Customer', na=False).astype(int)
3.4 Handling Missing Values
First, I drop all columns irrelevant to the analysis.
data = data.drop(columns=['Relationship Type', 'Engagement Commence', 'Engagement End', 'First GreyNoise Account Signup'])
data.head()
 | Industry | Country/Region | Latest Source | ARR | Had Prior Account | Churned |
---|---|---|---|---|---|---|
0 | Telecommunication Services | United Kingdom | Direct Traffic | 108677 | -1 | 0 |
1 | Government (International) | India | Direct Traffic | 72779 | 1 | 0 |
3 | Software | United Kingdom | Organic Search | 64201 | 0 | 0 |
4 | Professional Services | Germany | Direct Traffic | 37265 | 0 | 1 |
6 | IT Services | India | Email Marketing | 248322 | 0 | 1 |
I check again for missing values across the dataset:
data.isna().sum()
I remove rows with missing values. Given their low count, this should have a minimal effect on the overall dataset.
data.dropna(inplace=True)
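To make the "low count" claim concrete, one could log how many rows the drop removes. A minimal sketch of an alternative to the bare dropna call above (rows_before is my own variable, not part of the original notebook):
# minimal sketch: drop missing rows while reporting how many were removed
rows_before = len(data)
data.dropna(inplace=True)
print(f"Dropped {rows_before - len(data)} of {rows_before} rows")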
4. Model Training and Validation
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
# 1. define features and target
X = data.drop('Churned', axis=1)
y = data['Churned']
# 2. define preprocessor
# define column types
categorical_features = ['Industry', 'Country/Region', 'Latest Source', 'Had Prior Account']
numeric_features = ['ARR']
# create preprocessor with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numeric_features)
    ])

# create full pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])
# 3. estimate performance with stratified k-fold cv
# i use n = 5 splits
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# metrics to calculate
scoring_metrics = ['accuracy', 'roc_auc', 'average_precision']

# cross_validate with the StratifiedKFold object
cv_results = cross_validate(pipeline, X, y, cv=skf, scoring=scoring_metrics)
# print estimates
print("--- Model Performance Estimate (Stratified 5-Fold CV) ---")
print(f"Mean Accuracy: {np.mean(cv_results['test_accuracy']):.2f}")
print(f"Mean ROC AUC: {np.mean(cv_results['test_roc_auc']):.2f}")
# 4. train final model on ALL data
final_model = pipeline.fit(X, y)
--- Model Performance Estimate (Stratified 5-Fold CV) ---
Mean Accuracy: 0.51
Mean ROC AUC: 0.57
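With the final pipeline fit on all of the data, it can also be used to score accounts directly. A minimal sketch (churn_probabilities is my own name, and any printed values are not from the original run):
# minimal sketch: predicted churn probabilities from the final model
# column 1 of predict_proba corresponds to the positive class (Churned = 1)
churn_probabilities = final_model.predict_proba(X)[:, 1]
print(churn_probabilities[:5])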
5. Model Interpretation
# get trained classifier from pipeline
classifier = pipeline.named_steps['classifier']

# get preprocessor and transformed feature names
preprocessor = pipeline.named_steps['preprocessor']
feature_names = preprocessor.get_feature_names_out()

# create df
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': classifier.coef_[0]
})

# sort by the absolute value to see most impactful features
importance_df['Abs_Coefficient'] = importance_df['Coefficient'].abs()
importance_df = importance_df.sort_values(by='Abs_Coefficient', ascending=False)

importance_df
 | Feature | Coefficient | Abs_Coefficient |
---|---|---|---|
1 | cat__Industry_Financial Services | 1.425638 | 1.425638 |
19 | cat__Latest Source_Referrals | -1.014383 | 1.014383 |
10 | cat__Country/Region_Canada | 0.899791 | 0.899791 |
18 | cat__Latest Source_Organic Search | 0.627315 | 0.627315 |
3 | cat__Industry_Government (US) | -0.580932 | 0.580932 |
21 | cat__Had Prior Account_0 | 0.574032 | 0.574032 |
15 | cat__Latest Source_Direct Traffic | 0.562287 | 0.562287 |
22 | cat__Had Prior Account_1 | -0.550713 | 0.550713 |
2 | cat__Industry_Government (International) | -0.526404 | 0.526404 |
13 | cat__Country/Region_United Kingdom | -0.517882 | 0.517882 |
6 | cat__Industry_Professional Services | -0.508398 | 0.508398 |
12 | cat__Country/Region_India | 0.414341 | 0.414341 |
11 | cat__Country/Region_Germany | -0.408978 | 0.408978 |
5 | cat__Industry_Insurance | 0.316264 | 0.316264 |
7 | cat__Industry_Software | -0.261435 | 0.261435 |
9 | cat__Country/Region_Australia | -0.236989 | 0.236989 |
16 | cat__Latest Source_Email Marketing | -0.221798 | 0.221798 |
14 | cat__Country/Region_United States | -0.145851 | 0.145851 |
23 | num__ARR | 0.108085 | 0.108085 |
8 | cat__Industry_Telecommunication Services | 0.107558 | 0.107558 |
17 | cat__Latest Source_Offline Sources | 0.051013 | 0.051013 |
0 | cat__Industry_Energy | 0.031259 | 0.031259 |
20 | cat__Had Prior Account_-1 | -0.018886 | 0.018886 |
4 | cat__Industry_IT Services | 0.000883 | 0.000883 |
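Because logistic regression coefficients are on the log-odds scale, exponentiating them gives odds ratios, which can be easier to read. A minimal sketch (the Odds_Ratio column is my addition, not part of the original analysis):
# minimal sketch: convert log-odds coefficients to odds ratios
importance_df['Odds_Ratio'] = np.exp(importance_df['Coefficient'])
importance_df[['Feature', 'Coefficient', 'Odds_Ratio']].head()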
6. Final Model Export
import joblib
joblib.dump(final_model, 'gn_churn_model.joblib')
['gn_churn_model.joblib']
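The exported file can later be reloaded without re-running the notebook. A minimal sketch (loaded_model is my own name; scoring X here is purely to confirm the round trip works):
# minimal sketch: reload the exported pipeline and confirm it still predicts
loaded_model = joblib.load('gn_churn_model.joblib')
print(loaded_model.predict(X)[:5])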