RFM analysis with Python

avenger_wang / data / / 0 Comments
Table of Content
#load data
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter
import datetime as dt
df=pd.read_csv('data.csv',encoding='latin1')
df.head()
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountry
053636585123AWHITE HANGING HEART T-LIGHT HOLDER612/1/2010 8:262.5517850.0United Kingdom
153636571053WHITE METAL LANTERN612/1/2010 8:263.3917850.0United Kingdom
253636584406BCREAM CUPID HEARTS COAT HANGER812/1/2010 8:262.7517850.0United Kingdom
353636584029GKNITTED UNION FLAG HOT WATER BOTTLE612/1/2010 8:263.3917850.0United Kingdom
453636584029ERED WOOLLY HOTTIE WHITE HEART.612/1/2010 8:263.3917850.0United Kingdom
df.dtypes
InvoiceNo       object
StockCode       object
Description     object
Quantity         int64
InvoiceDate     object
UnitPrice      float64
CustomerID     float64
Country         object
dtype: object
#transfer data type
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])
df['CustomerID']=df['CustomerID'].astype('object')
df.dtypes
InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID             object
Country                object
dtype: object
#create time diff
R_today = dt.datetime(2021,8,7)
df['R_diff'] = (R_today - df['InvoiceDate']).dt.days
df.head(3)
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountryR_diff
053636585123AWHITE HANGING HEART T-LIGHT HOLDER62010-12-01 08:26:002.5517850United Kingdom3901
153636571053WHITE METAL LANTERN62010-12-01 08:26:003.3917850United Kingdom3901
253636584406BCREAM CUPID HEARTS COAT HANGER82010-12-01 08:26:002.7517850United Kingdom3901
#create R,F,M
R = df.groupby(by=['CustomerID'])['R_diff'].agg([('R','min')])
M=df.groupby(by=['CustomerID'])['UnitPrice'].agg([('M','sum')])
#drop duplicates and calculate the recency
F_undone=df.drop_duplicates(subset='InvoiceNo')
F_undone.head(3)
F=F_undone.groupby(by=['CustomerID'])['CustomerID'].agg([('F','count')])
RFM = R.join(F).join(M)
RFM.head(3)
RFM
CustomerID
12346.0385322.08
12347.035307481.21
12348.036034178.71
RFM.describe()
RFM
count4372.0000004372.0000004372.000000
mean3619.5811995.075480322.008226
std100.7721399.3387541284.783098
min3528.0000001.0000000.000000
25%3544.0000001.00000052.865000
50%3578.0000003.000000130.095000
75%3671.0000005.000000302.332500
max3901.000000248.00000041376.330000
#define R
RFM['R_revalue']=RFM['R']/365
RFM.head(5)
RFMR_revalue
CustomerID
12346.0385322.0810.556164
12347.035307481.219.671233
12348.036034178.719.871233
12349.035461605.109.715068
12350.03838165.3010.515068
#weight given: R(0.1)/F(0.5)/M(0.4)
RFM['value']=RFM['R_revalue']*0.1+RFM['F']*0.5+RFM['M']*0.4
RFM.head(3)
RFMR_revaluevalue
CustomerID
12346.0385322.0810.5561642.887616
12347.035307481.219.671233196.951123
12348.036034178.719.87123374.471123
RFM['value'].describe()
#most of the value under 124
#then we check the distribution of value which under 150
count     4372.000000
mean       132.332697
std        516.430006
min          1.470411
25%         22.965753
50%         54.611945
75%        124.896836
max      16568.499671
Name: value, dtype: float64
RFM_Common=RFM[RFM['value']<150]
sns.set()
sns.distplot(RFM_Common['value'],bins=8)

RFM_Common['value'].std()
38.006808099240864
RFM_Common['value'].mean()
49.523783395743465
#1.96 std contains 95% data, so the top limit should be 136
38+1.96 * 50
136.0
#set four levels of customers, 'level 4' are most valuable 
bins=[0,25,50,136,16569]
RFM['type']=pd.cut(RFM['value'],bins,labels=['level 1','level 2','level 3','level 4'])
RFM['type'].value_counts().plot(kind='pie',autopct='%1.2f%%',subplots=True)
array([],
      dtype=object)

#export customer list
df.to_csv('rfm.csv')

Leave a Reply