KDD Lab 7 2214

KDD & Data Mining
Lab Experiment No 7 : FP Growth Algorithm
Name:-Prathamesh More
PRN:- 20200802214
Aim : To perform FP Growth Algorithm on the given dataset (market-basket-optimisation.csv) using
1. By creating functions.
2. By using NumPy and FP Growth library.
And validating the results
By Creating Function
In [ ]: import numpy as np
import pandas as pd
from future import division, print_function
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules
import warnings
warnings.filterwarnings('ignore')
In [ ]: %pip install mlxtend --upgrade

In [ ]: # Define the Node class for the FP-Tree
class Node:
def init (self, item, count, parent):
self.item = item
self.count = count
self.parent = parent
self.children = {}
def increment(self, count):

self.count += count
# Create the FP-Tree

def create_fp_tree(data, min_support):
# Count the frequency of each item in the dataset
item_counts = {}
for transaction in data:
for item in transaction:
if item in item_counts:
item_counts[item] += 1
else:
item_counts[item] = 1
# Remove infrequent items from the dataset

data = [[item for item in transaction if item_counts[item] >= min_support] for trans
Loading [MathJax]/extensions/Safe.js
# Sort the items in each transaction by their frequency
data = [sorted(transaction, key=lambda item: item_counts[item], reverse=True) for tr
# Create the root node of the FP-Tree

root = Node(None, 0, None)
# Add each transaction to the FP-Tree

for transaction in data:
current_node = root
for item in transaction:
if item in current_node.children:
child_node = current_node.children[item]
child_node.increment(1)
else:
child_node = Node(item, 1, current_node)
current_node.children[item] = child_node
current_node = child_node
return root, item_counts
In [ ]: # Define the FP-Growth algorithm

def fp_growth(data, min_support):
# Create the FP-Tree
root, item_counts = create_fp_tree(data, min_support)
# Mine the FP-Tree for frequent itemsets

itemset_list = []
mine_fp_tree(root, [], itemset_list, min_support)
# Return the frequent itemsets and their counts

return itemset_list
# Define the function to recursively mine the FP-Tree for frequent itemsets
def mine_fp_tree(node, prefix, itemset_list, min_support):
if node.count >= min_support:
itemset = prefix + [node.item]
itemset_list.append((itemset, node.count))
for child_node in node.children.values():
mine_fp_tree(child_node, prefix + [node.item], itemset_list, min_support)
In [ ]: df = pd.read_csv('/content/Market_Basket_Optimisation.csv', header=None)
transaction = []
for i in df.itertuples():
l = set(list(i))
transaction.append([i for i in l if (str(i)!="nan" and type(i)!=int)])
len(transaction)
7501
Out[ ]:
In [ ]: itemsets = fp_growth(transaction,150)
itemsets
[([None, 'mineral water'], 1788),
Out[ ]:
([None, 'mineral water', 'eggs'], 382),
([None, 'mineral water', 'spaghetti'], 341),
([None, 'mineral water', 'chocolate'], 174),
([None, 'eggs'], 966),
([None, 'eggs', 'french fries'], 184),
([None, 'eggs', 'spaghetti'], 167),
([None, 'french fries'], 714),
([None, 'spaghetti'], 691),
([None, 'cookies'], 305),
([None, 'chocolate'], 434),
([None, 'green tea'], 360),
([None, 'escalope'], 177),
([None, 'milk'], 215)]
Using Libraries in-built func
In [ ]: df = pd.read_csv("/content/Market_Basket_Optimisation.csv", names=[i for i in range(20)]

df
Out[ ]: 0 1 2 3 4 5 6 7 8 9 10 11
whole low
vegetables green cottage energy tomato green
0 shrimp almonds avocado weat yams fat
mix grapes cheese drink juice tea
flour yogurt
1 burgers meatballs eggs NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 chutney NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 turkey avocado NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
mineral energy whole

4 milk green tea NaN NaN NaN NaN NaN NaN NaN
water bar wheat rice
... ... ... ... ... ... ... ... ... ... ... ... ...
fresh
7496 butter light mayo NaN NaN NaN NaN NaN NaN NaN NaN NaN
bread
frozen french green

7497 burgers eggs magazines NaN NaN NaN NaN NaN NaN
vegetables fries tea
7498 chicken NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
7499 escalope green tea NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
frozen yogurt low fat NaN NaN NaN NaN NaN NaN NaN NaN
7500 eggs
smoothie cake yogurt
7501 rows × 20 columns
In [ ]: transaction = []
for i in df.itertuples():
l = set(list(i))
transaction.append([i for i in l if (str(i)!="nan" and type(i)!=int)])
len(transaction)
7501
Out[ ]:
In [ ]: t = TransactionEncoder()
t_arr = t.fit_transform(transaction)
data = pd.DataFrame(t_arr, columns=t.columns_)

data
Out[ ]: antioxydant babies barbecue black
asparagus almonds asparagus avocado bacon blueberries ...
juice food sauce tea
0 False True True False True False False False False False ...
1 False False False False False False False False False False ...
3 False False False False True False False False False False ...
... ... ... ... ... ... ... ... ... ... ... ...
7501 rows × 120 columns
In [ ]: res = fpgrowth(data, min_support=0.05, use_colnames=True)

res
Out[ ]: support itemsets
0 0.238368 (mineral water)
1 0.132116 (green tea)
2 0.076523 (low fat yogurt)
3 0.071457 (shrimp)
4 0.065858 (olive oil)
5 0.063325 (frozen smoothie)
6 0.179709 (eggs)
7 0.087188 (burgers)
8 0.062525 (turkey)
9 0.129583 (milk)
10 0.058526 (whole wheat rice)
11 0.170911 (french fries)
12 0.050527 (soup)
13 0.174110 (spaghetti)
14 0.095321 (frozen vegetables)
15 0.080389 (cookies)
16 0.051060 (cooking oil)
17 0.163845 (chocolate)
18 0.059992 (chicken)
19 0.068391 (tomatoes)
20 0.095054 (pancakes)
21 0.052393 (grated cheese)
22 0.098254 (ground beef)
23 0.079323 (escalope)
24 0.081056 (cake)
25 0.050927 (mineral water, eggs)
26 0.059725 (mineral water, spaghetti)
27 0.052660 (mineral water, chocolate)
In [ ]: res = association_rules(res,metric="confidence", min_threshold=0.06)

res
Out[ ]: antecedent consequent
antecedents consequents support confidence lift leverage conviction
support support
(mineral
0 (eggs) 0.238368 0.179709 0.050927 0.213647 1.188845 0.008090 1.043158
water)
(mineral
1 (eggs) 0.179709 0.238368 0.050927 0.283383 1.188845 0.008090 1.062815
water)
(mineral
2 (spaghetti) 0.238368 0.174110 0.059725 0.250559 1.439085 0.018223 1.102008
water)
(mineral
3 (spaghetti) 0.174110 0.238368 0.059725 0.343032 1.439085 0.018223 1.159314
water)
(mineral
4 (chocolate) 0.238368 0.163845 0.052660 0.220917 1.348332 0.013604 1.073256
water)
(mineral
5 (chocolate) 0.163845 0.238368 0.052660 0.321400 1.348332 0.013604 1.122357
water)

KDD Lab 7 2214

Uploaded by

Document Information

Original Description:

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

KDD Lab 7 2214

Uploaded by

Copyright:

Available Formats

KDD & Data Mining

Lab Experiment No 7 : FP Growth Algorithm

Aim : To perform FP Growth Algorithm on the given dataset (market-basket-optimisation.csv) using

And validating the results

In [ ]: %pip install mlxtend --upgrade

def increment(self, count):

# Create the FP-Tree

# Remove infrequent items from the dataset

# Create the root node of the FP-Tree

# Add each transaction to the FP-Tree

return root, item_counts

In [ ]: # Define the FP-Growth algorithm

# Mine the FP-Tree for frequent itemsets

# Return the frequent itemsets and their counts

Using Libraries in-built func

In [ ]: df = pd.read_csv("/content/Market_Basket_Optimisation.csv", names=[i for i in range(20)]

mineral energy whole

frozen french green

7501 rows × 20 columns

data = pd.DataFrame(t_arr, columns=t.columns_)

7501 rows × 120 columns

In [ ]: res = fpgrowth(data, min_support=0.05, use_colnames=True)

0 0.238368 (mineral water)

1 0.132116 (green tea)

2 0.076523 (low fat yogurt)

4 0.065858 (olive oil)

5 0.063325 (frozen smoothie)

10 0.058526 (whole wheat rice)

11 0.170911 (french fries)

14 0.095321 (frozen vegetables)

16 0.051060 (cooking oil)

21 0.052393 (grated cheese)

22 0.098254 (ground beef)

25 0.050927 (mineral water, eggs)

26 0.059725 (mineral water, spaghetti)

27 0.052660 (mineral water, chocolate)

In [ ]: res = association_rules(res,metric="confidence", min_threshold=0.06)

You might also like