#load data
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter
import datetime as dt
df=pd.read_csv('data.csv',encoding='latin1')
#drop duplicates and calculate the recency
F_undone=df.drop_duplicates(subset='InvoiceNo')
F_undone.head(3)
F=F_undone.groupby(by=['CustomerID'])['CustomerID'].agg([('F','count')])
RFM = R.join(F).join(M)
RFM.head(3)
R
F
M
CustomerID
12346.0
3853
2
2.08
12347.0
3530
7
481.21
12348.0
3603
4
178.71
RFM.describe()
R
F
M
count
4372.000000
4372.000000
4372.000000
mean
3619.581199
5.075480
322.008226
std
100.772139
9.338754
1284.783098
min
3528.000000
1.000000
0.000000
25%
3544.000000
1.000000
52.865000
50%
3578.000000
3.000000
130.095000
75%
3671.000000
5.000000
302.332500
max
3901.000000
248.000000
41376.330000
#define R
RFM['R_revalue']=RFM['R']/365
RFM.head(5)
#1.96 std contains 95% data, so the top limit should be 136
38+1.96 * 50
136.0
#set four levels of customers, 'level 4' are most valuable
bins=[0,25,50,136,16569]
RFM['type']=pd.cut(RFM['value'],bins,labels=['level 1','level 2','level 3','level 4'])