final_ca

library(tidyverse)
library(dplyr)
library(lubridate)
library(Hmisc)
library(ggplot2)
library(plotly)
# Import data into R ----
eu_storeData <- readxl::read_excel("Cohort-6/EU Superstore_org.xls")
world_cities <- read.csv("Cohort-6/worldcities.csv")

# Add derived columns and save the result as df_eu_storeData.
df_eu_storeData <- eu_storeData |>
  mutate(
    # Order date converted to class Date.
    converted_date = as.Date(`Order Date`),
    # Per-unit sale value with the discount factor (1 - Discount) divided out.
    UnitPrice = (Sales / Quantity) / (1 - Discount),
    # Distinct orders divided by distinct customers. The data is not grouped
    # here, so this is one constant repeated on every row.
    n_repeated_transactions =
      n_distinct(`Order ID`) / n_distinct(`Customer ID`)
  )
# Replace missing values with zero.
#
# x: a vector (anything that supports `is.na()` and logical sub-assignment).
# Returns `x` with every NA element replaced by 0; other elements unchanged.
na.zero <- function(x) {
  x[is.na(x)] <- 0
  x
}
# Customer related data ----
# One row per customer: number of distinct orders, total sales and profit,
# first and last order date, ship-mode and product variety, and the average
# sale value per order. `recency` is the number of days between the last
# order and the fixed reference date 2019-01-31.
c_data <- df_eu_storeData |>
  group_by(`Customer ID`, `Customer Name`) |>
  summarise(
    n_transactions = n_distinct(`Order ID`),
    total_spent = sum(Sales),
    profit_generated = sum(Profit),
    date_first_transaction = min(converted_date),
    date_last_transaction = max(converted_date),
    n_shipmode_used = n_distinct(`Ship Mode`),
    n_items_purchased = n_distinct(`Product ID`),
    # NOTE(review): max() returns the largest value in sort order, which is
    # not necessarily the ship mode used most often - confirm intent.
    max_shipmode_used = max(`Ship Mode`),
    c_sale_value = total_spent / n_transactions
  ) |>
  mutate(
    # Units are fixed to days so the number never depends on difftime()'s
    # automatic unit selection.
    recency = as.numeric(
      difftime(ymd('2019-01-31'), date_last_transaction, units = "days")
    ),
    # Recency expressed in years, using 30-day months.
    customer_retension_time_in_yrs = round((recency / 30) / 12, 2)
  )

# Attach the per-customer figures to every order line (joined on the columns
# the two tables have in common).
df_clv_data <- left_join(df_eu_storeData, c_data)
# Yearly sales for each individual customer ----
# Year and year-month labels derived from the order date.
df_eu_storeData$year <- format(df_eu_storeData$converted_date, "%Y")
# "%m" is the two-digit month; "%M" would format the minute instead.
df_eu_storeData$month <- format(df_eu_storeData$converted_date, "%Y-%m")

# Total sales per customer and calendar year.
sales_by_customer_year <- df_eu_storeData %>%
  group_by(`Customer ID`, `Customer Name`, year) %>%
  summarise(total_sales = sum(Sales))
# Customer lifetime value ----
# avg_retension_time and avg_sale_value are medians taken over the rows of
# df_clv_data. clv multiplies the median retention time by each row's own
# c_sale_value and n_repeated_transactions (avg_sale_value is stored but does
# not enter the clv product).
clv_matrix <- mutate(
  df_clv_data,
  avg_retension_time = median(customer_retension_time_in_yrs, na.rm = TRUE),
  avg_sale_value = median(c_sale_value, na.rm = TRUE),
  clv = avg_retension_time * c_sale_value * n_repeated_transactions
)
# Customer discount availed ----
# Per customer: the sum of the discount rates over all order lines, joined
# with figures computed from the discounted lines (Discount > 0) only.
# Both sides are grouped by customer so that the join has key columns.
df_discounted_customers <- df_eu_storeData |>
  group_by(`Customer ID`, `Customer Name`) |>
  summarise(p_discount = sum(Discount)) |>
  left_join(
    df_eu_storeData |>
      filter(Discount > 0) |>
      group_by(`Customer ID`, `Customer Name`) |>
      summarise(
        # NOTE(review): operator precedence makes this
        # (sum(Discount) * sum(UnitPrice * Quantity)) - sum(Sales); confirm
        # this is the intended discount amount.
        discount_availed =
          sum(Discount) * sum(UnitPrice * Quantity) - sum(Sales),
        n_discounted_items_purchased = n_distinct(`Product ID`)
      )
  )
# Attach the discount figures to the CLV table, then turn a missing
# discounted-item count into 0.
df_merge1 <- left_join(clv_matrix, df_discounted_customers)
df_merge1 <- mutate(
  df_merge1,
  n_discounted_items_purchased = na.zero(n_discounted_items_purchased)
)
# Customer's favourite category by spending, quantity and times bought ----
# One row per customer and category, sorted within each customer from the
# highest to the lowest spending.
c_category_spent <- df_merge1 |>
  group_by(`Customer ID`, `Customer Name`, Category) |>
  summarise(
    spending = sum(Sales),
    quantity = sum(Quantity),
    n_items_purchased = n_distinct(`Product ID`),
    n_times_bought = n_distinct(`Order ID`)
  ) |>
  mutate(
    # Share of the customer's spending that went to this category.
    p_spending = round(spending / sum(spending), 2),
    # Largest per-category quantity of the customer (a number, not a
    # category name).
    most_catagory_purchased = max(quantity)
  ) |>
  ungroup() |>
  arrange(`Customer ID`, desc(spending))
# Most purchased category for each individual customer ----
# First total the quantity per customer and category, then keep, per
# customer, the category that comes first when sorted by descending quantity.
df_category_spent <- df_eu_storeData |>
  group_by(`Customer ID`, `Customer Name`, Category) |>
  summarise(quantity = sum(Quantity)) |>
  summarise(
    most_category_purchased = first(Category, order_by = desc(quantity))
  )

# Add the top category to the per-category spending table.
df_merge_category <- left_join(c_category_spent, df_category_spent)
# Disabled because NA occurs:
# left_join(df_merge1, df_merge_category) -> df_merge2
# Customer geographic behaviour ----
# One row per customer and city, sorted within each customer from the highest
# to the lowest spending.
# NOTE(review): the summarise() call was truncated in this listing; `spending`
# is restored because the later steps require it - confirm no other columns
# were lost.
customer_city_spent <- df_eu_storeData |>
  group_by(`Customer ID`, `Customer Name`, City) |>
  summarise(
    spending = sum(Sales),
    quantity = sum(Quantity),
    n_times_bought = n_distinct(`Order ID`)
  ) |>
  mutate(
    # Share of the customer's spending that went to this city.
    p_spending = round(spending / sum(spending), 2)
  ) |>
  arrange(`Customer ID`, desc(spending))
# Country wise purchase ----
# One row per country, sorted from the highest to the lowest spending.
# NOTE(review): the summarise() and mutate() calls were truncated in this
# listing; `spending` is restored because the later steps require it -
# confirm no other columns were lost.
df_country_spent <- df_eu_storeData |>
  group_by(Country) |>
  summarise(
    spending = sum(Sales),
    profit_generated = sum(Profit),
    n_items_purchased = n_distinct(`Product ID`),
    n_times_bought = n_distinct(`Order ID`)
  ) |>
  mutate(
    # Share of the overall spending that came from this country.
    p_spending = round(spending / sum(spending), 2)
  ) |>
  ungroup() |>
  arrange(desc(spending))
# Country in which each customer spent the most ----
# Total the sales per customer and country, then keep, per customer, the
# country that comes first when sorted by descending spending.
c_country_spent <- df_eu_storeData |>
  group_by(`Customer ID`, `Customer Name`, Country) |>
  summarise(country_spending = sum(Sales)) |>
  summarise(
    most_purchased_from = first(Country, order_by = desc(country_spending))
  )
# City wise purchase overall ----
# One row per city, sorted from the highest to the lowest spending.
# NOTE(review): the summarise() and mutate() calls were truncated in this
# listing; `spending` is restored because the later steps require it -
# confirm no other columns were lost.
df_city_spent <- df_eu_storeData |>
  group_by(City) |>
  summarise(
    spending = sum(Sales),
    profit_generated = sum(Profit),
    n_items_purchased = n_distinct(`Product ID`),
    n_times_bought = n_distinct(`Order ID`)
  ) |>
  mutate(
    # Share of the overall spending that came from this city.
    p_spending = round(spending / sum(spending), 2)
  ) |>
  ungroup() |>
  arrange(desc(spending))
# Most ship mode used ----
# Tabulates `Ship Mode` within every (customer, ship mode) group of
# df_eu_storeData and stores the result as shipmode_data.
df_eu_storeData |>
group_by(`Customer ID`,`Customer Name`,`Ship Mode`) |>
summarise(
# Every group contains a single ship mode, so table() is expected to return
# one count per group (assumes `Ship Mode` is a character column - TODO
# confirm).
counts = table(`Ship Mode`)) |>
mutate(
# NOTE(review): intended to pick the ship mode with the highest count; this
# relies on `counts` still carrying the table names after summarise() -
# verify the values this column actually gets.
max_shipmode_used = names(counts)[which.max(counts)]) -> shipmode_data

# Maximum ship mode used overall ----
# Re-groups shipmode_data by ship mode only and tabulates `Ship Mode` again.
# NOTE(review): the input is shipmode_data, not the raw order lines, so the
# counts here are numbers of shipmode_data rows per ship mode - confirm this
# is the intended measure.
shipmode_data |>
ungroup() |>
group_by(`Ship Mode`) |>
summarise(
counts = table(`Ship Mode`)) |>
mutate(
# NOTE(review): same reliance on table names surviving summarise() as in the
# per-customer step - verify.
max_shipmode_used =names(counts)[which.max(counts)],
# Share of each ship mode in the total count, rounded to two decimals.
p_shipmode_used = round(counts/sum(counts),2)
) -> overal_max_shiping_used
# Ship-mode summary over the whole of df_eu_storeData (no grouping is applied
# here): the number of distinct ship modes and the max() of `Ship Mode`.
# NOTE(review): max() returns the largest value in sort order, which is not
# necessarily the most used ship mode - confirm intent.
n_times_shipmode_used <- summarise(
  df_eu_storeData,
  n_shipmode_used = n_distinct(`Ship Mode`),
  prefered_shipmode = max(`Ship Mode`)
)
# Segment wise sales and profit ----
# Totals per segment plus each segment's share of the overall sales and
# profit, rounded to two decimals.
segment_sale <- df_eu_storeData |>
  group_by(Segment) |>
  summarise(
    total_segment_sale = sum(Sales),
    total_segment_profit = sum(Profit)
  ) |>
  mutate(
    p_total_segment_sale =
      round(total_segment_sale / sum(total_segment_sale), 2),
    p_total_segment_profit =
      round(total_segment_profit / sum(total_segment_profit), 2)
  )
# Median number of days between a customer's orders ----
# 1. Reduce the order lines to one date per customer and order.
# 2. Per customer, take the gap in days to the previous order (by date) and
#    the median of those gaps.
# 3. Collapse to one row per customer.
median_bw_orders <- df_eu_storeData |>
  group_by(`Customer ID`, `Customer Name`, `Order ID`) |>
  summarise(date_order = first(converted_date)) |>
  mutate(
    # The earliest order of a customer has no predecessor, so its gap is NA.
    days_since_last_visit =
      as.numeric(date_order - lag(date_order, order_by = date_order)),
    avg_visit_days = median(days_since_last_visit, na.rm = TRUE)
  ) |>
  summarise(median_days_bw_transactions = first(avg_visit_days))

# Customer table extended with the median gap; this is the input of the RFM
# scoring.
df_rfm_customer <- left_join(c_data, median_bw_orders)
# Dividing customers into intervals and scoring them using the RFM model ----
# recency, n_transactions and total_spent are each binned with cut2(g = 5)
# and converted to integer scores. The recency factor is reversed first, so a
# smaller recency value receives a higher score. RFMScore packs the three
# scores into one number (recency = hundreds, frequency = tens,
# spending = units) and case_when() maps it to a named segment; a score that
# appears in none of the lists yields NA.
rfm_matrix <- df_rfm_customer |>
  ungroup() |>
  mutate(
    interval_recency = cut2(recency, g = 5),
    interval_frequency = cut2(n_transactions, g = 5),
    interval_spending = cut2(total_spent, g = 5),
    score_recency = as.integer(fct_rev(interval_recency)),
    score_frequency = as.integer(interval_frequency),
    score_spending = as.integer(interval_spending),
    RFMScore = (score_recency * 100) + (score_frequency * 10) +
      (score_spending),
    rfm_segment = case_when(
      # Recent purchase, frequent transactions, high spending.
      RFMScore %in% c(555, 554, 544, 545, 454, 455, 445) ~ "Champions",
      # Often spend good money buying your products. Responsive to promotions.
      RFMScore %in% c(543, 444, 435, 355, 354, 345, 344, 335) ~
        "Loyal Customers",
      # Recent customers but spent a good amount and bought more than once.
      RFMScore %in% c(
        553, 551, 552, 541, 542, 533, 532, 531, 452, 451, 442, 441, 431, 453,
        433, 432, 423, 353, 352, 351, 342, 341, 333, 323
      ) ~ "Potential Loyalists",
      # Bought most recently, but not often.
      RFMScore %in% c(512, 511, 422, 421, 412, 411, 311) ~ "Recent Customers",
      # Recent shoppers but haven't spent much.
      RFMScore %in% c(
        525, 524, 523, 522, 521, 515, 514, 513, 425, 424, 413, 414, 415, 315,
        314, 313
      ) ~ "Promising",
      # Above-average recency, frequency and monetary values. They may not
      # have bought very recently though.
      RFMScore %in% c(535, 534, 443, 434, 343, 334, 325, 324) ~
        "Needs Attention",
      # Below average recency, frequency, and monetary values. Will lose them
      # if not reactivated.
      RFMScore %in% c(331, 321, 312, 221, 213) ~ "About to Sleep",
      # They spent big money and purchased often. But the last purchase was a
      # long time ago.
      RFMScore %in% c(
        255, 254, 245, 244, 253, 252, 243, 242, 235, 234, 225, 224, 153, 152,
        145, 143, 142, 135, 134, 133, 125, 124
      ) ~ "At Risk",
      # Often made the biggest purchases but they haven't returned for a long
      # time.
      RFMScore %in% c(155, 154, 144, 214, 215, 115, 114, 113) ~ "Can’t lose",
      # The last purchase was long ago. Low spenders with a low number of
      # orders.
      RFMScore %in% c(
        332, 322, 231, 241, 251, 233, 232, 223, 222, 132, 123, 122, 212, 211
      ) ~ "Hibernating",
      # Lowest recency, frequency, and monetary scores.
      RFMScore %in% c(111, 112, 121, 131, 141, 151) ~ "Lost"
    )
  )

# Open the scored table in the data viewer.
rfm_matrix |> view()
# Customer segmentation on profit basis ----
# Total profit per customer, binned with cut2(g = 5); the bin number
# (1 = lowest profit bin, 5 = highest) is mapped to a label.
# NOTE(review): the per-customer group_by() is restored here - without it
# summarise() yields a single row and the following ungroup() and binning have
# nothing to work on. Confirm the grouping keys.
customer_type_profit_based <- df_eu_storeData |>
  group_by(`Customer ID`, `Customer Name`) |>
  summarise(c_profit_generated = sum(Profit)) |>
  ungroup() |>
  mutate(
    profit_intervals = cut2(c_profit_generated, g = 5),
    p_score = as.integer(profit_intervals),
    customer_Type = case_when(
      p_score == 5 ~ "High Profitable",
      p_score == 4 ~ "Profitable",
      p_score == 3 ~ "Average Profitable",
      p_score == 2 ~ "Below Average Profitable",
      p_score == 1 ~ "Low Profitable"
    )
  )
# Combine the customer-level tables with successive full outer merges
# (all = TRUE), folding from left to right in the order listed; each merge
# joins on the column names the two tables have in common.
merged_customer_data <- Reduce(
  function(left, right) merge(left, right, all = TRUE),
  list(
    c_data,
    df_category_spent,
    c_country_spent,
    n_times_shipmode_used,
    df_rfm_customer,
    rfm_matrix,
    customer_type_profit_based,
    df_discounted_customers
  )
)
#write_rds(sales_by_customer_year, "Cohort-6/porfolio project/df_customer_year_sale.rds")

final_ca

Uploaded by

Document Information

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

final_ca

Uploaded by

Copyright:

Available Formats

library(tidyverse)

#import data into R

eu_storeData <- readxl::read_excel("Cohort-6/EU Superstore_org.xls")

mutate(converted_date = as.Date(`Order Date`),

UnitPrice = (Sales/Quantity) / (1- Discount),

n_repeated_transactions = n_distinct(`Order ID`)/n_distinct(`Customer ID`))-> df_eu_storeData

na.zero <- function (x) {

#Customer related data

group_by(`Customer ID`, `Customer Name`) |>

n_transactions = n_distinct(`Order ID`),

n_shipmode_used = n_distinct(`Ship Mode`),

n_items_purchased = n_distinct(`Product ID`),

max_shipmode_used = max(`Ship Mode`),

c_sale_value = total_spent/n_transactions) |>

recency = as.numeric(difftime(ymd('2019-01-31'), date_last_transaction)),

customer_retension_time_in_yrs = round((recency/30)/12,2)) -> c_data

left_join(df_eu_storeData, c_data) -> df_clv_data

#yearly sale for individual customer

df_eu_storeData$year <- format(df_eu_storeData$converted_date, "%Y")

df_eu_storeData$month <- format(df_eu_storeData$converted_date, "%Y-%M")

group_by(`Customer ID`,`Customer Name`, year) %>%

summarise(total_sales = sum(Sales)) -> sales_by_customer_year

#Customer Life time value

avg_retension_time = median(customer_retension_time_in_yrs, na.rm = TRUE),

avg_sale_value = median(c_sale_value, na.rm = TRUE),

clv = avg_retension_time * c_sale_value * n_repeated_transactions)-> clv_matrix

#Customer Discount availed

group_by(`Customer ID`, `Customer Name`) |>

p_discount = sum(Discount)) |>

filter(Discount > 0) |>

group_by(`Customer ID`, `Customer Name`) |>

discount_availed = sum(Discount) * sum(UnitPrice*Quantity) - sum(Sales),

n_discounted_items_purchased = n_distinct(`Product ID`))

left_join(clv_matrix, df_discounted_customers) -> df_merge1

group_by(`Customer ID`, `Customer Name`, Category) |>

n_items_purchased = n_distinct(`Product ID`),

n_times_bought = n_distinct(`Order ID`)) |>

most_catagory_purchased = max(quantity)) |>

arrange(`Customer ID`, desc(spending)) -> c_category_spent

#for individual customer most category purchased

group_by(`Customer ID`, `Customer Name`, Category) |>

summarise(quantity = sum(Quantity)) |>

order_by = desc(quantity)) ) -> df_category_spent

left_join(c_category_spent, df_category_spent) -> df_merge_category

#left_join(df_merge1, df_merge_category) -> df_merge2

#Customer Geographic behaviour

group_by(`Customer ID`, `Customer Name`, City) |>

n_times_bought = n_distinct(`Order ID`)) |>

p_spending = round(spending/sum(spending),2)) |>

arrange(`Customer ID`, desc(spending)) -> customer_city_spent

#Country wise purchase

n_items_purchased = n_distinct(`Product ID`),

n_times_bought = n_distinct(`Order ID`)) |>

p_spending = round(spending/sum(spending),2)) |>

arrange(desc(spending)) -> df_country_spent

#Country wise most spending by customer

group_by(`Customer ID`, `Customer Name`,Country) |>

order_by = desc(country_spending)) ) -> c_country_spent

#City wise purchase overal

n_items_purchased = n_distinct(`Product ID`),

n_times_bought = n_distinct(`Order ID`)) |>

p_spending = round(spending/sum(spending),2)) |>

arrange(desc(spending)) -> df_city_spent

#Most Shipmode used