电信用户流失分析
Contents
电信用户流失分析
说明
这次文章的参考来自 https://zhuanlan.zhihu.com/p/68397317
提出问题
关于用户留存有这样一个观点,如果将用户流失率降低5%,公司利润将提升25%-85% 如今高居不下的获客成本让电信运营商遭遇“天花板”,甚至陷入获客难的窘境 随着市场饱和度上升,电信运营商亟待解决增加用户黏性,延长用户生命周期的问题 因此,电信用户流失分析与预测至关重要。
数据集来自 kaggle
理解数据
字段名 | 数据类型 | 字段描述 |
---|---|---|
customerID | String | 顾客ID |
gender | String | 客户性别 |
SeniorCitizen | Integer | 客户是否为老年人 |
Partner | String | 客户是否有合作伙伴 |
Dependents | String | 客户是否有家属 |
tenure | Integer | 客户在公司停留的月数 |
PhoneService | String | 客户是否有电话服务 |
MultipleLines | String | 客户是否有多条路线 |
InternetService | String | 客户的互联网服务提供商 |
OnlineSecurity | String | 客户是否具有在线安全性 |
OnlineBackup | String | 客户是否有在线备份 |
DeviceProtection | String | 客户是否有设备保护 |
TechSupport | String | 客户是否有技术支持 |
StreamingTV | String | 客户是否有流媒体电视 |
StreamingMovies | String | 客户是否与流媒体电影 |
Contract | String | 客户的合同期限 |
PaperlessBilling | String | 客户是否有无纸化账单 |
PaymentMethod | String | 客户的支付方式 |
MonthlyCharges | Integer | 每月向客户收取的金额 |
TotalCharges | Integer | 向客户收取的总金额 |
Churn | String | 客户是否流失 |
数据清洗
首先导入数据
origindata = CSV.read("data/telco-customer-churn/data.csv", DataFrame)
查看数据类型,发现 TotalCharges
为字符串,把他改为浮点型数据,但是发现他有几个空的数据,是 " "
我们定义处理函数
function transformTotalCharges(dataframe::DataFrame)
indexs = dataframe[!, :TotalCharges] .== " "
dataframe[!, :TotalCharges][indexs] .= string.(dataframe[!, :MonthlyCharges][indexs])
dataframe[!, :TotalCharges] = map(x -> parse(Float64, x), dataframe[!, :TotalCharges])
dataframe[!, :tenure][indexs] .= 1
return dataframe
end
解释下最后几行代码, TotalCharges
为空的行中,其 tenure
入网时长为 0 个月,这样我们推测是当月新入用户
根据一般禁烟,用户即使在注册的当月流失,也需缴纳当月费用,因此将这种用户入网市场改为1,将总消费额填充为月消费额
可视化分析
首先我们查看流失用户和占比
function plotChurn(dataframe::DataFrame)
counts = countmap(dataframe[!, :Churn])
yescount = counts["Yes"]
nocount = counts["No"]
total = yescount + nocount
xs = ["Yes", "No"]
ys1 = [yescount / total, nocount / total]
ys2 = [yescount, nocount]
pie(xs, ys1, aspect_ratio = :equal) |> display
bar(xs, ys2, size = figuresize) |> display
end
plotChurn(origindata)
属于不平衡数据集,流失用户占比达 26.54%
用户属性分析
function plotPercentages(dataframe::DataFrame, feature::Symbol, ymatrix::Matrix{Float64})
columns = [feature, :Churn]
groupDataframe = groupby(select(dataframe, columns), feature)
xs = []
let
index = 1
for _dataframe in groupDataframe
x = first(_dataframe[!, feature])
push!(xs, x)
yescount = count(isequal("Yes"), _dataframe[!, :Churn])
nocount = count(isequal("No"), _dataframe[!, :Churn])
total = yescount + nocount
ymatrix[index, :] = [yescount / total, nocount / total]
index += 1
end
end
groupedbar(ymatrix, xticks = (1:length(xs), xs), label = ["Yes" "No"]) |> display
end
plotPercentages(origindata, :SeniorCitizen, ones((2, 2)))
plotPercentages(origindata, :gender, ones((2, 2)))
plotPercentages(origindata, :Partner, ones((2, 2)))
plotPercentages(origindata, :Dependents, ones((2, 2)))
density(origindata.tenure, group = origindata.Churn, size = figuresize) |> display
可以看出
- 用户流失与性别基本无关
- 年老用户流失占比显著高于年轻用户
服务属性分析
plotPercentages(origindata, :MultipleLines, ones((3, 2)))
plotPercentages(origindata, :InternetService, ones((3, 2)))
function plotPaperlessBillingChurn(dataframe::DataFrame)
columns = [:PaperlessBilling, :Contract, :Churn]
groupDataframe = groupby(select(dataframe, columns), :PaperlessBilling)
array = unique(dataframe[!, :Contract])
for _dataframe in groupDataframe
_dataframe = filter(row -> row.Churn == "Yes", _dataframe)
paperlessbilling = first(_dataframe[!, :PaperlessBilling])
churn1 = count(isequal(array[1]), _dataframe[!, :Contract])
churn2 = count(isequal(array[2]), _dataframe[!, :Contract])
churn3 = count(isequal(array[3]), _dataframe[!, :Contract])
total = churn1 + churn2 + churn3
ys = [churn1 / total, churn2 / total, churn3 / total]
bar(array, ys, title = "PaperlessBilling = $paperlessbilling") |> display
end
end
plotPaperlessBillingChurn(origindata)
function plotNumberOfCustomer(dataframe::DataFrame)
columns = ["PhoneService", "MultipleLines", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies"]
ymatrix = ones((length(columns), 3))
index = 1
for column in columns
_dataframe = select(filter(row -> row.InternetService != "No", dataframe), [column, "Churn"])
count1 = count(isequal("Yes"), _dataframe[!, column])
count2 = count(isequal("No"), _dataframe[!, column])
if column != "MultipleLines"
ymatrix[index, :] = [count1, count2, 0]
else
ymatrix[index, :] = [count1, count2, count(isequal("No phone service"), _dataframe[!, column])]
end
index += 1
end
groupedbar(ymatrix, xticks = (1:length(columns), columns), label = ["Has Service" "No Service" "No Service"], size = figuresize) |> display
end
plotNumberOfCustomer(origindata)
function plotNumberOfChurnCustomer(dataframe::DataFrame)
columns = ["PhoneService", "MultipleLines", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies"]
ymatrix = ones((length(columns), 2))
index = 1
for column in columns
_dataframe = select(filter(row -> row.InternetService != "No" && row.Churn == "Yes", dataframe), [column, "Churn"])
# has service but churn
yescount = count(isequal("Yes"), _dataframe[!, column])
# has no service but churn
nocount = count(isequal("No"), _dataframe[!, column])
ymatrix[index, :] = [yescount, nocount]
index += 1
end
groupedbar(ymatrix, xticks = (1:length(columns), columns), label = ["Has Service" "No Service"], size = figuresize) |> display
end
plotNumberOfChurnCustomer(origindata)
可以看出
- 电话服务整体对用户流失影响较小
- 单光纤用户的流失占比较高
- 光纤用户绑定了安全,备份,保护,技术支持服务的流失率较低
- 光纤用户附加流媒体电视,电影服务的流失率占比较高
合同属性分析
plotPercentages(origindata, :PaymentMethod, ones((4, 2)))
density(origindata.MonthlyCharges, group = origindata.Churn, size = figuresize) |> display
density(origindata.TotalCharges, group = origindata.Churn, size = figuresize) |> display
可以看出
- 采用电子支票支付的用户流失率最高,推测该方式的使用体验较为一般
- 签订合同方式对客户流失率影响为 按月签订 > 按一年签订 > 按两年签订,证明长期合同最能保留客户
- 月消费额大约在 70 - 110 之间用户流失率较高
- 长期来看,用户总消费越高,流失率月底,符合一般经验