Product

You might also like

Download as txt, pdf, or txt
Download as txt, pdf, or txt
You are on page 1 of 3

from functools import reduce

from pyspark.sql import SparkSession
# NOTE: `sum` below is the Spark column aggregate and shadows the builtin
# `sum` for the rest of this module.
from pyspark.sql.functions import (
    col,
    collect_list,
    concat_ws,
    lit,
    regexp_replace,
    sort_array,
    sum,
    to_date,
)

def parse_cart_data(df, name=None, table_name=None):
    """Clean raw cart-event rows and overwrite a Delta table with the result.

    Transformations, in order:

    1. Rename ``dimension2``/``dimension3``/``dimension7``/``dimension9``/
       ``eventLabel`` to ``selector``/``selector_country``/``session_id``/
       ``cart_id``/``product``.
    2. Replace the literal text ``(not set)`` in ``country`` with ``Unknown``.
    3. Drop rows whose ``selector`` contains ``,``, ``;`` or ``[``.
    4. Parse ``date`` (``yyyyMMdd``) into a new ``formatted_date`` column.
    5. Fill null ``session_id`` with ``"NA"``.
    6. Cast ``totalEvents`` to int.
    7. Set ``eventAction`` to the constant *name* on every row.
    8. Remove fully duplicated rows.

    Args:
        df: Input DataFrame holding the raw event columns named above.
        name: Value written to the ``eventAction`` column.
        table_name: Path passed to the Delta writer's ``save``.

    Returns:
        The transformed DataFrame (the same object that was written; it is
        not re-read from *table_name*).
    """
    column_renames = (
        ("dimension2", "selector"),
        ("dimension3", "selector_country"),
        ("dimension7", "session_id"),
        ("dimension9", "cart_id"),
        ("eventLabel", "product"),
    )
    for old_name, new_name in column_renames:
        df = df.withColumnRenamed(old_name, new_name)

    # The parentheses must be escaped: unescaped, "(not set)" is a regex
    # capture group that matches only `not set`, which would leave the
    # surrounding brackets behind and produce "(Unknown)".
    df = df.withColumn(
        "country", regexp_replace(col("country"), r"\(not set\)", "Unknown")
    )

    # Raw string so that `\[` reaches the regex engine without Python
    # treating it as an (invalid) string escape.
    # NOTE(review): a null selector makes the predicate NULL, so those rows
    # are filtered out as well -- confirm that is intended.
    df = df.filter(~df.selector.rlike(r".*(,|;|\[).*"))

    df = df.withColumn("formatted_date", to_date("date", "yyyyMMdd"))
    df = df.na.fill("NA", "session_id")
    df = df.withColumn("totalEvents", col("totalEvents").astype("int"))
    df = df.withColumn("eventAction", lit(name))
    df = df.drop_duplicates()

    (
        df.write.format("delta")
        .option("overwriteSchema", "true")
        .mode("overwrite")
        .save(table_name)
    )
    return df

def process_cart_data(pic_df):
    """Total ``eventValue`` per product and context.

    Rows are grouped by ``product``, ``selector``, ``source``, ``country``,
    ``selector_country`` and ``formatted_date``; the summed ``eventValue`` is
    exposed as ``total_added``.

    Args:
        pic_df: DataFrame containing the grouping columns and ``eventValue``.

    Returns:
        DataFrame with the six grouping columns plus ``total_added``,
        de-duplicated.
    """
    grouping_keys = [
        'product',
        'selector',
        'source',
        'country',
        'selector_country',
        'formatted_date',
    ]
    total_added = sum('eventValue').alias('total_added')
    aggregated = pic_df.groupby(grouping_keys).agg(total_added)
    return aggregated.drop_duplicates()

def find_top_product_combos(pic_df):
    """Count how often each multi-product combination was added to a cart.

    For every ``cart_id`` the products are collected, sorted, and joined into
    a comma-separated ``product_combinations`` string. Carts whose string has
    no comma (a single collected product) are discarded. Each remaining cart
    is paired with one row of its context columns, and the combinations are
    counted per ``selector``, ``source``, ``country``, ``selector_country``
    and ``formatted_date``.

    Args:
        pic_df: DataFrame with ``cart_id``, ``product`` and the five context
            columns listed above.

    Returns:
        DataFrame with ``product_combinations``, the five context columns and
        ``no_of_times_added``.
    """
    context_cols = [
        'selector',
        'source',
        'country',
        'selector_country',
        'formatted_date',
    ]

    # collect_list gives no ordering guarantee once rows have been shuffled
    # by groupby, so sorting the input beforehand is not enough. Sorting the
    # collected array makes "A,B" and "B,A" the same combination.
    products_per_cart = (
        pic_df.select('cart_id', 'product')
        .groupby('cart_id')
        .agg(sort_array(collect_list('product')).alias('product_combinations'))
        .select(
            'cart_id',
            concat_ws(',', 'product_combinations').alias('product_combinations'),
        )
    )
    multi_product_carts = products_per_cart.where(
        col('product_combinations').contains(",")
    )

    # One representative row per cart supplies the context columns.
    # NOTE(review): drop_duplicates on a subset keeps an arbitrary row; the
    # preceding orderBy is kept as a best-effort hint only.
    cart_context = pic_df.orderBy(['cart_id', 'product']).drop_duplicates(
        ['cart_id']
    )

    # Join by column name (as the sibling functions do) instead of comparing
    # two column objects that originate from the same source frame, which is
    # an ambiguous self-join condition.
    carts_with_context = multi_product_carts.join(
        cart_context, 'cart_id', 'inner'
    )

    # groupby().count() already yields exactly one row per key combination,
    # so no further de-duplication is required.
    return (
        carts_with_context.groupby(['product_combinations'] + context_cols)
        .count()
        .withColumnRenamed('count', 'no_of_times_added')
    )

def find_products_in_combo(pic_df):
    """Map every multi-product combination string to its member products.

    Products are collected per ``cart_id``, sorted, and joined with commas
    into ``product_combinations``. Strings without a comma are dropped. One
    cart is kept per distinct combination string and joined back to the
    distinct ``(cart_id, product)`` pairs to list the products it contains.

    Args:
        pic_df: DataFrame with at least ``cart_id`` and ``product``.

    Returns:
        DataFrame of distinct ``(product_combinations, product)`` rows.
    """
    # collect_list order is not guaranteed after the groupby shuffle; sorting
    # the array gives each product set a single canonical string, so the
    # de-duplication on product_combinations below also catches permutations.
    products_per_cart = (
        pic_df.select('cart_id', 'product')
        .groupby('cart_id')
        .agg(sort_array(collect_list('product')).alias('product_combinations'))
        .select(
            'cart_id',
            concat_ws(',', 'product_combinations').alias('product_combinations'),
        )
    )
    multi_product_carts = products_per_cart.where(
        col('product_combinations').contains(",")
    )

    # Keep a single cart per combination string to look its products up.
    one_cart_per_combo = multi_product_carts.drop_duplicates(
        ["product_combinations"]
    )
    cart_products = pic_df.select(["cart_id", "product"]).drop_duplicates()

    combo_members = one_cart_per_combo.join(cart_products, "cart_id", "inner")
    return combo_members.select(
        ["product_combinations", "product"]
    ).drop_duplicates()

def find_other_products(prod_df):
    """Count pairs of different products that share a cart.

    The ``(cart_id, product)`` projection is joined with itself on
    ``cart_id`` (the second copy exposing ``other_product``), pairs where
    both sides are the same product are removed, and one row per cart with
    ``product`` dropped is joined in to supply the remaining columns. The
    pairs are then counted per ``selector``, ``country``,
    ``selector_country``, ``source``, ``formatted_date``, ``product`` and
    ``other_product``.

    Args:
        prod_df: DataFrame with ``cart_id``, ``product`` and the context
            columns named above.

    Returns:
        DataFrame with the seven grouping columns plus ``no_of_times_added``,
        de-duplicated.
    """
    count_keys = [
        'selector',
        'country',
        'selector_country',
        'source',
        'formatted_date',
        'product',
        'other_product',
    ]

    # Right-hand side of the self-join: same projection, product renamed.
    partner_side = prod_df.select(['cart_id', 'product']).withColumnRenamed(
        'product', 'other_product'
    )
    pairs = prod_df.select(['cart_id', 'product']).join(
        partner_side, "cart_id", 'inner'
    )
    distinct_pairs = pairs.where(pairs.product != pairs.other_product)

    # One row per cart, minus its product, carries the context columns.
    cart_context = prod_df.drop_duplicates(['cart_id']).drop('product')
    pairs_with_context = distinct_pairs.join(cart_context, 'cart_id', 'inner')

    pair_counts = pairs_with_context.groupby(count_keys).count()
    pair_counts = pair_counts.withColumnRenamed("count", "no_of_times_added")
    return pair_counts.drop_duplicates()

# Driver: create the session, resolve table locations, then run each stage
# and overwrite its Delta output.
spark = (
    SparkSession.builder
    .appName("Snc Product Data Processor")
    .getOrCreate()
)

# The "#{...}" tokens are kept verbatim.
# NOTE(review): presumably substituted by a templating step before the job
# runs -- confirm. Do not convert these literals to f-strings; "{StartDate}"
# would then be evaluated as a Python expression.
s3_base_path = "#{EventDataBasePath}"

cart_products_table = (
    s3_base_path + "events/raw/products_in_cart_#{StartDate}_#{EndDate}"
)
parsed_cart_products_table = (
    s3_base_path + "events/parsed/products_in_cart_#{StartDate}_#{EndDate}"
)
processed_cart_table = (
    s3_base_path + "events/processed/products_in_cart_#{StartDate}_#{EndDate}"
)
top_prod_table = (
    s3_base_path
    + "events/processed/product_combinations_#{StartDate}_#{EndDate}"
)
prod_in_combo_table = (
    s3_base_path + "events/processed/products_in_combo_#{StartDate}_#{EndDate}"
)
other_prod_table = (
    s3_base_path + "events/processed/other_products_#{StartDate}_#{EndDate}"
)


def _overwrite_delta(frame, location):
    """Overwrite the Delta table at *location* with *frame*, schema included."""
    writer = frame.write.format("delta").option("overwriteSchema", "true")
    writer.mode("overwrite").save(location)


# parse_cart_data writes the parsed table itself and returns the parsed frame.
prod_df = spark.read.format("delta").load(cart_products_table)
prod_df = parse_cart_data(prod_df, 'add product', parsed_cart_products_table)

processed_prod_df = process_cart_data(prod_df)
_overwrite_delta(processed_prod_df, processed_cart_table)

top_prod_df = find_top_product_combos(prod_df)
_overwrite_delta(top_prod_df, top_prod_table)

prod_in_combo_df = find_products_in_combo(prod_df)
_overwrite_delta(prod_in_combo_df, prod_in_combo_table)

oth_df = find_other_products(prod_df)
_overwrite_delta(oth_df, other_prod_table)

You might also like