# Professional Documents
# Culture Documents
# Product
# Product
# Product
def process_cart_data(pic_df):
    """Total ``eventValue`` per product and reporting dimension.

    Args:
        pic_df: DataFrame of cart events (a Spark DataFrame, judging by the
            API used on it). It must contain ``eventValue`` and the six
            grouping columns listed in ``groups``.

    Returns:
        A DataFrame with one row per distinct combination of the grouping
        columns, plus ``total_added`` holding the summed ``eventValue``.
    """
    # Imported locally and namespaced so the aggregate can never resolve to
    # Python's builtin ``sum``, which raises TypeError on a column name.
    from pyspark.sql import functions as F

    groups = [
        'product', 'selector', 'source', 'country', 'selector_country',
        'formatted_date',
    ]
    # groupby().agg() already yields exactly one row per key combination, so
    # the drop_duplicates() that used to follow was a no-op costing a shuffle.
    return pic_df.groupby(groups).agg(
        F.sum('eventValue').alias('total_added')
    )
def find_top_product_combos(pic_df):
    """Pair every multi-product cart's combination string with its cart row.

    For each ``cart_id`` the products are collected into a comma-separated
    ``product_combinations`` string; carts whose string contains no comma
    are discarded. The remaining carts are joined to one row per cart taken
    from the input.

    NOTE(review): the version of this function under review was incomplete.
    ``pcart_df`` was read without ever being assigned, ``groups`` was never
    used, and nothing was returned although the caller writes the result.
    ``pcart_df`` is rebuilt here the same way ``find_products_in_combo``
    builds it. Any ranking/aggregation that originally followed the join is
    not visible -- confirm against the upstream source before relying on
    the output shape.

    Args:
        pic_df: DataFrame of cart events with at least ``cart_id`` and
            ``product`` columns.

    Returns:
        The joined DataFrame: ``cart_id``, ``product_combinations`` and the
        remaining columns of one input row per cart.
    """
    from pyspark.sql.functions import collect_list, concat_ws, sort_array

    # NOTE(review): not referenced by any surviving statement; presumably
    # consumed by the missing final step -- kept so that intent is not lost.
    groups = [
        'cart_id', 'selector', 'source', 'country', 'selector_country',
        'formatted_date',
    ]
    prod_df = pic_df.orderBy(['cart_id', 'product'])
    # sort_array makes the combination string independent of the order in
    # which collect_list happens to receive rows after the shuffle.
    pcart_df = (
        prod_df.select(['cart_id', 'product'])
        .groupby('cart_id')
        .agg(sort_array(collect_list('product')).alias('product_combinations'))
        .select(
            'cart_id',
            concat_ws(',', 'product_combinations').alias(
                'product_combinations'),
        )
    )
    # A comma means the cart produced at least two list entries.
    fcart_df = pcart_df.where(pcart_df.product_combinations.contains(","))
    # One (arbitrary) event row per cart supplies the cart-level attributes.
    pdf = prod_df.drop_duplicates(['cart_id'])
    # Join by column name: the previous condition compared two columns that
    # share a lineage, and it also left two ``cart_id`` columns in the result.
    mcart_df = fcart_df.join(pdf, "cart_id", "inner")
    return mcart_df
def find_products_in_combo(pic_df):
    """List the individual products belonging to each product combination.

    Steps:
      1. Collect every cart's products into a sorted, comma-separated
         ``product_combinations`` string.
      2. Keep only strings containing a comma (two or more list entries).
      3. Keep one representative cart per distinct combination string.
      4. Join that cart back to its distinct ``(cart_id, product)`` pairs.

    NOTE(review): ``collect_list`` keeps repeats, so a cart holding the same
    product twice also passes the comma filter; the same is true of a single
    product whose name contains a comma.

    Args:
        pic_df: DataFrame of cart events with ``cart_id`` and ``product``
            columns.

    Returns:
        DataFrame of distinct ``(product_combinations, product)`` rows.
    """
    from pyspark.sql.functions import sort_array

    cart_products = pic_df.select(['cart_id', 'product'])

    # Sort inside the aggregate: collect_list gives no ordering guarantee
    # after the groupBy shuffle, so sorting the input frame beforehand could
    # still yield "b,a" for one cart and "a,b" for another identical basket.
    # With the list sorted here, the global orderBy is no longer needed.
    pcart_df = (
        cart_products.groupby('cart_id')
        .agg(sort_array(collect_list('product')).alias('product_combinations'))
        .select(
            'cart_id',
            concat_ws(',', 'product_combinations').alias(
                'product_combinations'),
        )
    )
    fcart_df = pcart_df.where(pcart_df.product_combinations.contains(","))
    # Any one cart is enough to enumerate the products of its combination.
    ucart_df = fcart_df.drop_duplicates(["product_combinations"])
    updf = cart_products.drop_duplicates()
    tcart_df = ucart_df.join(updf, "cart_id", "inner")
    cart_df = tcart_df.select(
        ["product_combinations", "product"]
    ).drop_duplicates()
    return cart_df
def find_other_products(prod_df):
    """Fail fast: the body of this function is missing.

    NOTE(review): only the ``ogroups`` column list survives in the version
    under review. No DataFrame is built or returned, so the function used to
    return ``None`` and the caller then failed on ``None.write``. Raising here
    reports the real problem at the call. Restore the original logic from the
    upstream source rather than guessing at it.

    Args:
        prod_df: DataFrame of cart events (unused until the body is
            restored).

    Raises:
        NotImplementedError: always.
    """
    # Kept verbatim: the only surviving statement of the original body.
    ogroups = ['selector', 'country', 'selector_country', 'source',
               'formatted_date', 'product', 'other_product']
    raise NotImplementedError(
        "find_other_products is incomplete: only the column list "
        f"{ogroups} is defined; no DataFrame is built or returned"
    )
def _overwrite_delta(df, path):
    """Write ``df`` to ``path`` as Delta, replacing both data and schema."""
    (
        df.write.format("delta")
        .option("overwriteSchema", "true")
        .mode("overwrite")
        .save(path)
    )


spark = (
    SparkSession.builder
    .appName("Snc Product Data Processor")
    .getOrCreate()
)

# The "#{...}" tokens are kept verbatim; presumably the job launcher replaces
# them before this script runs -- confirm with the deployment configuration.
# Each concatenation is parenthesised: a line ending in a bare "+" is a
# SyntaxError in Python.
s3_base_path = "#{EventDataBasePath}"
cart_products_table = (
    s3_base_path + "events/raw/products_in_cart_#{StartDate}_#{EndDate}"
)
parsed_cart_products_table = (
    s3_base_path + "events/parsed/products_in_cart_#{StartDate}_#{EndDate}"
)
processed_cart_table = (
    s3_base_path + "events/processed/products_in_cart_#{StartDate}_#{EndDate}"
)
top_prod_table = (
    s3_base_path
    + "events/processed/product_combinations_#{StartDate}_#{EndDate}"
)
prod_in_combo_table = (
    s3_base_path
    + "events/processed/products_in_combo_#{StartDate}_#{EndDate}"
)
other_prod_table = (
    s3_base_path + "events/processed/other_products_#{StartDate}_#{EndDate}"
)

# Load the raw cart events and parse them; parse_cart_data is defined outside
# this part of the file.
prod_df = spark.read.format("delta").load(cart_products_table)
prod_df = parse_cart_data(prod_df, 'add product', parsed_cart_products_table)

# Every derived table is computed from the same parsed frame and overwritten.
processed_prod_df = process_cart_data(prod_df)
_overwrite_delta(processed_prod_df, processed_cart_table)

top_prod_df = find_top_product_combos(prod_df)
_overwrite_delta(top_prod_df, top_prod_table)

prod_in_combo_df = find_products_in_combo(prod_df)
_overwrite_delta(prod_in_combo_df, prod_in_combo_table)

oth_df = find_other_products(prod_df)
_overwrite_delta(oth_df, other_prod_table)