New York City Airbnb Open Data Analysis

2022-07-28 631 words 3 minutes

Contents

New York City Airbnb Open Data Analysis

以下流程参考自 https://www.kaggle.com/code/chirag9073/airbnb-analysis-visualization-and-prediction

导入库

using MLJFlux, Flux, MLJ, DataFrames, CSV, StatsBase
using WordCloud

加载数据

# Load the raw listings table; the path is relative to the current working directory.
origindata = let csvpath = "data/newyork-city-airbnb-open-data/AB_NYC_2019.csv"
    CSV.read(csvpath, DataFrame)
end

观察数据

你可以像教程那样

也可以像我一样，用 Excel 打开 CSV 文件

我写了一个表格，记录我观察到的结果

column	missing count	type	type coerce	fill/drop
id	0	Int	Count => Continuous	None
name	16	String?	Multiclass	Drop
host_id	0	Int	Count => Continuous	None
host_name	21	String?	Multiclass	Drop
neighbourhood_group	0	String15	Multiclass => Count => Continuous	None
neighbourhood	0	String31	Multiclass => Count => Continuous	None
latitude	0	Float64	Continuous	None
longitude	0	Float64	Continuous => Multiclass => Continuous	None
room_type	0	String15	Multiclass => Count => Continuous	None
price	0	Int	Count => Continuous	None
minimum_nights	0	Int	Count => Continuous	None
number_of_reviews	0	Int	Count => Continuous	None
last_review	10052	Date?	Date => Count => Continuous ?	Drop
reviews_per_month	10052	Float64?	Continuous	Drop
calculated_host_listings_count	0	Int	Count => Continuous	None
availability_365	0	Int	Count => Continuous	None

你可以用这段代码来观察 missing 的数据量

# For every column of the raw table, print how many of its entries are `missing`.
foreach(names(origindata)) do column
    _count = count(ismissing, origindata[!, column])
    println("$column: missing $_count data")
end

数据清洗

基于上述数据观察，我们这样确定清洗流程，首先我们选择抛弃的特征

# Feature selector for the first cleaning step. Because `ignore` is true, the columns
# listed in `features` are the ones removed; every other column is kept.
featureSelector = FeatureSelector(;
    features=[:id, :name, :host_name, :last_review], ignore=true
)

:last_review 字段已被抛弃；与之相关的 :reviews_per_month 字段同样存在大量缺失值，这里决定丢弃该字段缺失的行

"""
    dropMissing(dataframe::DataFrame)

Return the result of `dropmissing` applied to `dataframe` for the `:reviews_per_month`
column, i.e. the table without the rows whose `:reviews_per_month` entry is `missing`.
"""
dropMissing(dataframe::DataFrame) = dropmissing(dataframe, :reviews_per_month)

我们发现 :longitude 字段的数值都在 -74、-75 上下，于是先将它向下取整，再把取整后的值分别编码为 1 和 2

"""
    processLongitude(dataframe::DataFrame)

Replace the `:longitude` column by a numeric code of its floored value.

Every longitude is rounded down with `floor`. The distinct floored values are numbered
`1, 2, …` in order of first appearance, and each row receives the number of its value,
stored as `Float64`. The `:longitude` column of `dataframe` itself is overwritten, and
the same `dataframe` is returned.
"""
function processLongitude(dataframe::DataFrame)
    floored = map(floor, dataframe[!, :longitude])

    # Floored longitude => index of its first appearance.
    codes = Dict{Float64,Float64}(
        value => code for (code, value) in enumerate(unique(floored))
    )

    dataframe[!, :longitude] = map(floored) do value
        codes[value]
    end
    return dataframe
end

:neighbourhood_group 字段有多个重复的值，我们将其进行编码

"""
    processNeighbourhoodGroup(dataframe::DataFrame)

Replace the `:neighbourhood_group` column by integer codes.

The distinct group names are numbered `1, 2, …` in order of first appearance and each
row receives the number of its group. The column of `dataframe` itself is overwritten,
and the same `dataframe` is returned.
"""
function processNeighbourhoodGroup(dataframe::DataFrame)
    column = dataframe[!, :neighbourhood_group]

    # Group name => index of its first appearance.
    codes = Dict{String,Int}(
        group => code for (code, group) in enumerate(unique(column))
    )

    dataframe[!, :neighbourhood_group] = map(column) do group
        codes[group]
    end
    return dataframe
end

:neighbourhood 和 :room_type 也是类似的

"""
    processNeighbourhood(dataframe::DataFrame)

Replace the `:neighbourhood` column by integer codes.

The distinct neighbourhood names are numbered `1, 2, …` in order of first appearance
and each row receives the number of its neighbourhood. The column of `dataframe` itself
is overwritten, and the same `dataframe` is returned.
"""
function processNeighbourhood(dataframe::DataFrame)
    column = dataframe[!, :neighbourhood]

    # Neighbourhood name => index of its first appearance.
    codes = Dict{String,Int}(
        place => code for (code, place) in enumerate(unique(column))
    )

    dataframe[!, :neighbourhood] = map(column) do place
        codes[place]
    end
    return dataframe
end

"""
    processRoomType(dataframe::DataFrame)

Replace the `:room_type` column by integer codes.

The distinct room types are numbered `1, 2, …` in order of first appearance and each
row receives the number of its room type. The column of `dataframe` itself is
overwritten, and the same `dataframe` is returned.
"""
function processRoomType(dataframe::DataFrame)
    column = dataframe[!, :room_type]

    # Room type => index of its first appearance.
    codes = Dict{String,Int}(
        kind => code for (code, kind) in enumerate(unique(column))
    )

    dataframe[!, :room_type] = map(column) do kind
        codes[kind]
    end
    return dataframe
end

别忘了将科学类型 Count 改为科学类型 Continuous

"""
    coerceCount(dataframe::DataFrame)

Return `dataframe` coerced with the rule `Count => Continuous`, so that columns of
scientific type `Count` become `Continuous`.
"""
coerceCount(dataframe::DataFrame) = coerce(dataframe, Count => Continuous)

最后转换数据

# Chain the feature selector and the plain cleaning functions into one pipeline;
# the components are listed in the order they are applied.
transformModel = Pipeline(
    featureSelector,
    dropMissing,
    processLongitude,
    processNeighbourhoodGroup,
    processNeighbourhood,
    processRoomType,
    coerceCount,
)

# Bind the pipeline to the raw table and fit it (`fit!` hands back the machine),
# then run the raw table through the fitted pipeline to obtain the cleaned table.
transformMachine = machine(transformModel, origindata) |> fit!
transformedData = MLJ.transform(transformMachine, origindata)

数据可视化

别忘了导入库和设置 plot 后端

using Plots
using StatsPlots

# Select the Plotly backend for the plots that follow.
plotly()

Plotting all neighbourhood group

let
    # Occurrences of each neighbourhood group in the raw table.
    tally = countmap(origindata[!, :neighbourhood_group])
    groups = collect(keys(tally))
    occurrences = collect(values(tally))

    display(bar(groups, occurrences; title="Neighbourhood Group"))
end

Plotting neighbourhood

let
    # Occurrences of each neighbourhood in the raw table.
    tally = countmap(origindata[!, :neighbourhood])
    places = collect(keys(tally))
    occurrences = collect(values(tally))

    # There are many categories, so show every tick, rotate the labels and use a
    # large canvas.
    chart = bar(
        places,
        occurrences;
        xrotation=-90,
        xticks=:all,
        size=(1920, 1680),
        title="Neighbourhood",
    )
    display(chart)
end

Plotting room type

let
    # Occurrences of each room type in the raw table.
    tally = countmap(origindata[!, :room_type])
    kinds = collect(keys(tally))
    occurrences = collect(values(tally))

    display(bar(kinds, occurrences))
end

Plotting relation between neighbourhood_group and availability_365 of room

let
    # One box per neighbourhood group, summarising that group's `availability_365`.
    groups = origindata[!, :neighbourhood_group]
    availability = origindata[!, :availability_365]

    display(boxplot(groups, availability))
end

Plotting map of neighbourhood_group

let
    # The original referenced an undefined `figuresize`, which raises UndefVarError.
    # Define the canvas size locally so the snippet runs on its own.
    figuresize = (1980, 1600)

    groups = unique(origindata[!, :neighbourhood_group])
    palette = [:red, :green, :blue, :black, :yellow]

    # Each group needs its own colour; fail with a clear message instead of a
    # BoundsError when the data holds more groups than the palette has entries.
    length(groups) <= length(palette) || throw(
        ArgumentError("$(length(groups)) groups but only $(length(palette)) colours"),
    )

    # Group name => marker colour, assigned in order of first appearance.
    colorof = Dict{String,Symbol}(zip(groups, palette))

    # Longitude/latitude scatter, one point per row, coloured by its group.
    markercolors = map(group -> colorof[group], origindata[!, :neighbourhood_group])
    scatter(
        origindata[!, :longitude],
        origindata[!, :latitude];
        markercolor=markercolors,
        size=figuresize,
    ) |> display
end

Plotting map of room_type

let
    # Room type => marker colour, assigned in order of first appearance.
    palette = [:red, :green, :blue]
    colorof = Dict{String,Symbol}()
    for (slot, kind) in enumerate(unique(origindata[!, :room_type]))
        colorof[kind] = palette[slot]
    end

    # Longitude/latitude scatter, one point per row, coloured by its room type.
    markercolors = map(origindata[!, :room_type]) do kind
        colorof[kind]
    end
    chart = scatter(
        origindata[!, :longitude],
        origindata[!, :latitude];
        markercolor=markercolors,
        size=(1980, 1600),
    )
    display(chart)
end

Plotting availability of room

let
    # The original referenced an undefined `figuresize`, which raises UndefVarError.
    # Define the canvas size locally so the snippet runs on its own.
    figuresize = (1980, 1600)

    # Marker colour for an `availability_365` value, bucketed in steps of 150;
    # anything outside [0, 450) is drawn in black.
    function mapcolor(number::Number)
        0 <= number < 150 && return :red
        150 <= number < 300 && return :green
        300 <= number < 450 && return :blue
        return :black
    end

    # Longitude/latitude scatter, one point per row, coloured by availability bucket.
    markercolors = map(mapcolor, origindata[!, :availability_365])

    # The original was missing the closing parenthesis of this `scatter(` call,
    # so the whole block failed to parse.
    scatter(
        origindata[!, :longitude],
        origindata[!, :latitude];
        markercolor=markercolors,
        size=figuresize,
    ) |> display
end

Word Cloud

using WordCloud
# Build a word cloud from the neighbourhood column, run `generate!` on it, and
# write the result to a PNG file.
wc = generate!(wordcloud(origindata[!, :neighbourhood]))
paint(wc, "/home/steiner/Downloads/neighbourhood.png")