Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1 of 10

library(tidyverse)

library(dplyr)

library(lubridate)

library(Hmisc)

library(ggplot2)

library(plotly)

# Import the two input files: the EU Superstore workbook (Excel) and the
# world-cities table (CSV).
eu_storeData <- readxl::read_excel(path = "Cohort-6/EU Superstore_org.xls")

world_cities <- read.csv(file = "Cohort-6/worldcities.csv")

# Add helper columns to the raw table and keep the result as df_eu_storeData:
#   converted_date          - `Order Date` coerced to Date
#   UnitPrice               - (Sales / Quantity) / (1 - Discount); presumably
#                             the per-unit price before discount
#   n_repeated_transactions - distinct orders divided by distinct customers;
#                             the table is not grouped here, so every row
#                             carries the same value
df_eu_storeData <- mutate(
  eu_storeData,
  converted_date = as.Date(`Order Date`),
  UnitPrice = (Sales / Quantity) / (1 - Discount),
  n_repeated_transactions =
    n_distinct(`Order ID`) / n_distinct(`Customer ID`)
)

# Replace every NA in `x` with 0.
#
# x: a vector (anything that supports is.na() and logical sub-assignment).
# Returns a copy of `x` in which the NA entries are 0 and all other entries
# are unchanged.
na.zero <- function(x) {
  x[is.na(x)] <- 0
  x
}

# Customer-level summary, one row per customer, stored in c_data.
#
# Columns produced:
#   n_transactions                 - distinct orders
#   total_spent, profit_generated  - summed Sales and Profit
#   date_first_transaction,
#   date_last_transaction          - earliest and latest order date
#   n_shipmode_used                - distinct ship modes
#   n_items_purchased              - distinct product IDs
#   max_shipmode_used              - ship mode found on the most rows
#   c_sale_value                   - total_spent / n_transactions
#   recency                        - days from the last order to 2019-01-31
#   customer_retension_time_in_yrs - recency / 30 / 12, rounded to 2 decimals
c_data <- df_eu_storeData |>
  group_by(`Customer ID`, `Customer Name`) |>
  summarise(
    n_transactions = n_distinct(`Order ID`),
    total_spent = sum(Sales),
    profit_generated = sum(Profit),
    date_first_transaction = min(converted_date),
    date_last_transaction = max(converted_date),
    n_shipmode_used = n_distinct(`Ship Mode`),
    n_items_purchased = n_distinct(`Product ID`),
    # Most frequent ship mode. max() on a character column returns the
    # alphabetically last value, which is not the most used one. Ties go to
    # the alphabetically first mode; an all-NA customer yields NA.
    max_shipmode_used = names(which.max(table(`Ship Mode`, useNA = "ifany"))),
    c_sale_value = total_spent / n_transactions,
    .groups = "drop"
  ) |>
  mutate(
    # Pin the unit: with the default units = "auto", difftime() picks
    # secs/mins/hours/days from the smallest difference it sees.
    recency = as.numeric(
      difftime(ymd("2019-01-31"), date_last_transaction, units = "days")
    ),
    # NOTE(review): this is derived from the time since the last order, not
    # from the first-to-last order span - confirm that is the intended
    # "retention" measure.
    customer_retension_time_in_yrs = round((recency / 30) / 12, 2)
  )

# Attach the customer-level columns to every row of the order table.
df_clv_data <- left_join(
  df_eu_storeData,
  c_data,
  by = c("Customer ID", "Customer Name")
)

# Yearly sales per customer.
#
# Two character calendar keys are added to df_eu_storeData first:
#   year  - "YYYY"
#   month - "YYYY-MM"
df_eu_storeData$year <- format(df_eu_storeData$converted_date, "%Y")

# "%m" is the month number. "%M" is the minute field, so "%Y-%M" never
# produced a month.
df_eu_storeData$month <- format(df_eu_storeData$converted_date, "%Y-%m")

# One row per customer and year; the result stays grouped by customer.
sales_by_customer_year <- df_eu_storeData |>
  group_by(`Customer ID`, `Customer Name`, year) |>
  summarise(total_sales = sum(Sales), .groups = "drop_last")

# Customer lifetime value, added to every row of df_clv_data:
#   avg_retension_time - median of customer_retension_time_in_yrs (all rows)
#   avg_sale_value     - median of c_sale_value (all rows)
#   clv                - avg_retension_time * c_sale_value *
#                        n_repeated_transactions
# NOTE(review): both medians run over every row of df_clv_data rather than
# one row per customer, and avg_sale_value is not used in clv - confirm
# that this is intended.
clv_matrix <- mutate(
  df_clv_data,
  avg_retension_time = median(customer_retension_time_in_yrs, na.rm = TRUE),
  avg_sale_value = median(c_sale_value, na.rm = TRUE),
  clv = avg_retension_time * c_sale_value * n_repeated_transactions
)

# Discount usage per customer.
#   p_discount                   - sum of Discount over all of the customer's
#                                  rows (a sum of rates, not a proportion)
#   discount_availed             - amount saved on rows with Discount > 0
#   n_discounted_items_purchased - distinct product IDs on those rows
# Customers with no discounted row keep NA in the last two columns.
df_discounted_customers <- df_eu_storeData |>
  group_by(`Customer ID`, `Customer Name`) |>
  summarise(p_discount = sum(Discount), .groups = "drop") |>
  left_join(
    df_eu_storeData |>
      filter(Discount > 0) |>
      group_by(`Customer ID`, `Customer Name`) |>
      summarise(
        # UnitPrice is (Sales / Quantity) / (1 - Discount), so
        # UnitPrice * Quantity is the row's value before discount and its
        # difference to Sales is the discount amount. The earlier expression,
        # sum(Discount) * sum(UnitPrice * Quantity) - sum(Sales), multiplied
        # a sum of rates by a sum of values before subtracting.
        discount_availed = sum(UnitPrice * Quantity - Sales),
        n_discounted_items_purchased = n_distinct(`Product ID`),
        .groups = "drop"
      ),
    by = c("Customer ID", "Customer Name")
  )

# Add the per-customer discount columns to the CLV table, then turn NA into 0
# in n_discounted_items_purchased.
df_merge1 <- clv_matrix |>
  left_join(df_discounted_customers) |>
  mutate(
    n_discounted_items_purchased = na.zero(n_discounted_items_purchased)
  )

# Per customer and category: spend, quantity, distinct products and distinct
# orders, plus the category's share of the customer's spend. Rows are sorted
# by customer and then by descending spend.
c_category_spent <- df_merge1 |>
  group_by(`Customer ID`, `Customer Name`, Category) |>
  summarise(
    spending = sum(Sales),
    quantity = sum(Quantity),
    n_items_purchased = n_distinct(`Product ID`),
    n_times_bought = n_distinct(`Order ID`),
    # Keep the customer grouping so the shares below are per customer.
    .groups = "drop_last"
  ) |>
  mutate(
    p_spending = round(spending / sum(spending), 2),
    # Largest per-category quantity for the customer (a number, not a
    # category name).
    most_catagory_purchased = max(quantity)
  ) |>
  ungroup() |>
  arrange(`Customer ID`, desc(spending))

# For each customer, the category with the highest total quantity.
df_category_spent <- df_eu_storeData |>
  group_by(`Customer ID`, `Customer Name`, Category) |>
  summarise(quantity = sum(Quantity), .groups = "drop_last") |>
  summarise(
    most_category_purchased = first(Category, order_by = desc(quantity)),
    .groups = "drop_last"
  )

# Put that category next to every per-category row of the customer.
df_merge_category <- left_join(c_category_spent, df_category_spent)

#left_join(df_merge1, df_merge_category) -> df_merge2

# (join above left disabled: NA values occur in the result)

# Customer geography: per customer and city, the spend, quantity and number
# of distinct orders, plus the city's share of the customer's spend. Rows are
# ordered by customer and descending spend; the result stays grouped by
# customer ID and name.
customer_city_spent <- df_eu_storeData |>
  group_by(`Customer ID`, `Customer Name`, City) |>
  summarise(
    spending = sum(Sales),
    quantity = sum(Quantity),
    n_times_bought = n_distinct(`Order ID`),
    .groups = "drop_last"
  ) |>
  mutate(p_spending = round(spending / sum(spending), 2)) |>
  arrange(`Customer ID`, desc(spending))

# Country totals: spend, profit, distinct products and distinct orders, plus
# each country's share of overall spend, sorted by descending spend.
df_country_spent <- df_eu_storeData |>
  group_by(Country) |>
  summarise(
    spending = sum(Sales),
    profit_generated = sum(Profit),
    n_items_purchased = n_distinct(`Product ID`),
    n_times_bought = n_distinct(`Order ID`),
    # Only one grouping column, so the summary is already ungrouped.
    .groups = "drop"
  ) |>
  mutate(p_spending = round(spending / sum(spending), 2)) |>
  arrange(desc(spending))

# For each customer, the country in which they spent the most.
c_country_spent <- df_eu_storeData |>
  group_by(`Customer ID`, `Customer Name`, Country) |>
  summarise(country_spending = sum(Sales), .groups = "drop_last") |>
  summarise(
    most_purchased_from = first(Country, order_by = desc(country_spending)),
    .groups = "drop_last"
  )

# City totals: spend, profit, distinct products and distinct orders, plus
# each city's share of overall spend, sorted by descending spend.
df_city_spent <- df_eu_storeData |>
  group_by(City) |>
  summarise(
    spending = sum(Sales),
    profit_generated = sum(Profit),
    n_items_purchased = n_distinct(`Product ID`),
    n_times_bought = n_distinct(`Order ID`),
    # Only one grouping column, so the summary is already ungrouped.
    .groups = "drop"
  ) |>
  mutate(p_spending = round(spending / sum(spending), 2)) |>
  arrange(desc(spending))

# Ship-mode usage per customer: one row per customer and ship mode with
#   counts            - number of rows of df_eu_storeData for that pair
#   max_shipmode_used - the customer's ship mode with the highest count
#                       (ties go to the first mode in sort order)
# count() gives a plain integer column. The earlier
# summarise(counts = table(...)) stored one-cell table objects in the column
# and depended on their names surviving to recover the mode.
shipmode_data <- df_eu_storeData |>
  count(`Customer ID`, `Customer Name`, `Ship Mode`, name = "counts") |>
  group_by(`Customer ID`, `Customer Name`) |>
  mutate(max_shipmode_used = `Ship Mode`[which.max(counts)]) |>
  ungroup()


# Ship-mode usage overall, built from shipmode_data (one row per customer and
# ship mode):
#   counts            - number of shipmode_data rows for the ship mode
#   max_shipmode_used - the ship mode with the highest count
#   p_shipmode_used   - counts as a share of all rows, rounded to 2 decimals
# count() replaces summarise(counts = table(...)), which stored table objects
# in the column and depended on their names to find the most used mode.
# NOTE(review): this counts customer/ship-mode pairs, not orders - confirm
# that is the intended measure of "most used".
overal_max_shiping_used <- shipmode_data |>
  ungroup() |>
  count(`Ship Mode`, name = "counts") |>
  mutate(
    max_shipmode_used = `Ship Mode`[which.max(counts)],
    p_shipmode_used = round(counts / sum(counts), 2)
  )

# Per customer: how many distinct ship modes they used and which one they
# used most.
n_times_shipmode_used <- df_eu_storeData |>
  group_by(`Customer ID`, `Customer Name`) |>
  summarise(
    n_shipmode_used = n_distinct(`Ship Mode`),
    # Most frequent ship mode. max() on a character column returns the
    # alphabetically last value, which says nothing about preference. Ties go
    # to the alphabetically first mode; an all-NA customer yields NA.
    prefered_shipmode = names(which.max(table(`Ship Mode`, useNA = "ifany"))),
    .groups = "drop"
  )

# Sales and profit per customer segment, with each segment's share of the
# overall sales and profit totals (rounded to 2 decimals).
segment_sale <- df_eu_storeData |>
  group_by(Segment) |>
  summarise(
    total_segment_sale = sum(Sales),
    total_segment_profit = sum(Profit),
    # Only one grouping column, so the summary is already ungrouped.
    .groups = "drop"
  ) |>
  mutate(
    p_total_segment_sale =
      round(total_segment_sale / sum(total_segment_sale), 2),
    p_total_segment_profit =
      round(total_segment_profit / sum(total_segment_profit), 2)
  )
# Median number of days between a customer's consecutive orders.
# Each order is dated by the first converted_date among its rows, and the
# gaps are taken between orders sorted by that date. A customer with a single
# order has no gap and gets NA.
median_bw_orders <- df_eu_storeData |>
  group_by(`Customer ID`, `Customer Name`, `Order ID`) |>
  summarise(date_order = first(converted_date), .groups = "drop_last") |>
  summarise(
    median_days_bw_transactions = median(
      as.numeric(date_order - lag(date_order, order_by = date_order)),
      na.rm = TRUE
    ),
    .groups = "drop_last"
  )

# Customer summary plus the median gap between orders.
df_rfm_customer <- left_join(c_data, median_bw_orders)

# Score customers with an RFM model and label each score with a segment.
#
# recency, n_transactions and total_spent are each binned with
# Hmisc::cut2(g = 5). A score is the bin number; the recency bins are
# reversed first, so a smaller recency (a more recent last order) scores
# higher. RFMScore packs the three scores as digits:
#   recency * 100 + frequency * 10 + spending
# A score that appears in none of the lists below gets NA in rfm_segment.
rfm_matrix <- df_rfm_customer |>
  ungroup() |>
  mutate(
    interval_recency = cut2(recency, g = 5),
    interval_frequency = cut2(n_transactions, g = 5),
    interval_spending = cut2(total_spent, g = 5),
    score_recency = as.integer(fct_rev(interval_recency)),
    score_frequency = as.integer(interval_frequency),
    score_spending = as.integer(interval_spending),
    RFMScore = (score_recency * 100) + (score_frequency * 10) +
      score_spending,
    rfm_segment = case_when(
      # Recent purchase, frequent transactions, high spending.
      RFMScore %in% c(555, 554, 544, 545, 454, 455, 445) ~ "Champions",

      # Often spend good money buying your products. Responsive to
      # promotions.
      RFMScore %in% c(543, 444, 435, 355, 354, 345, 344, 335) ~
        "Loyal Customers",

      # Recent customers, but spent a good amount and bought more than once.
      RFMScore %in% c(
        553, 551, 552, 541, 542, 533, 532, 531, 452, 451, 442, 441, 431,
        453, 433, 432, 423, 353, 352, 351, 342, 341, 333, 323
      ) ~ "Potential Loyalists",

      # Bought most recently, but not often.
      RFMScore %in% c(512, 511, 422, 421, 412, 411, 311) ~
        "Recent Customers",

      # Recent shoppers, but haven't spent much.
      RFMScore %in% c(
        525, 524, 523, 522, 521, 515, 514, 513, 425, 424, 413, 414, 415,
        315, 314, 313
      ) ~ "Promising",

      # Above-average recency, frequency and monetary values. They may not
      # have bought very recently, though.
      RFMScore %in% c(535, 534, 443, 434, 343, 334, 325, 324) ~
        "Needs Attention",

      # Below-average recency, frequency and monetary values. Will lose them
      # if not reactivated.
      RFMScore %in% c(331, 321, 312, 221, 213) ~ "About to Sleep",

      # They spent big money and purchased often, but the last purchase was
      # a long time ago.
      RFMScore %in% c(
        255, 254, 245, 244, 253, 252, 243, 242, 235, 234, 225, 224, 153,
        152, 145, 143, 142, 135, 134, 133, 125, 124
      ) ~ "At Risk",

      # Often made the biggest purchases, but they haven't returned for a
      # long time.
      RFMScore %in% c(155, 154, 144, 214, 215, 115, 114, 113) ~ "Can’t lose",

      # The last purchase was long ago. Low spenders with a low number of
      # orders.
      RFMScore %in% c(
        332, 322, 231, 241, 251, 233, 232, 223, 222, 132, 123, 122, 212, 211
      ) ~ "Hibernating",

      # Lowest recency, frequency and monetary scores.
      RFMScore %in% c(111, 112, 121, 131, 141, 151) ~ "Lost"
    )
  )

# Open the scored table in the data viewer.
rfm_matrix |> view()

# Profit-based customer segmentation.
# Total profit per customer is binned with Hmisc::cut2(g = 5); the bin number
# (p_score) selects the label, with 1 the lowest-profit bin and 5 the highest.
customer_type_profit_based <- df_eu_storeData |>
  group_by(`Customer ID`, `Customer Name`) |>
  summarise(c_profit_generated = sum(Profit), .groups = "drop") |>
  mutate(
    profit_intervals = cut2(c_profit_generated, g = 5),
    p_score = as.integer(profit_intervals),
    # Labels are listed in p_score order, so indexing by the score picks the
    # matching label; an NA score gives an NA label.
    customer_Type = c(
      "Low Profitable",
      "Below Average Profitable",
      "Average Profitable",
      "Profitable",
      "High Profitable"
    )[p_score]
  )

# Combine every customer-level table into one wide table. The tables are
# full-outer merged (all = TRUE) from left to right, in the order listed, on
# whatever columns each pair has in common.
merged_customer_data <- Reduce(
  function(merged_so_far, next_table) {
    merge(merged_so_far, next_table, all = TRUE)
  },
  list(
    c_data,
    df_category_spent,
    c_country_spent,
    n_times_shipmode_used,
    df_rfm_customer,
    rfm_matrix,
    customer_type_profit_based,
    df_discounted_customers
  )
)

#write_rds(sales_by_customer_year, "Cohort-6/porfolio project/df_customer_year_sale.rds")

You might also like