Download as pdf or txt
Download as pdf or txt
You are on page 1 of 17

3/8/24, 6:10 PM Data-Visualization-05-03 - Jupyter Notebook

In [1]: import numpy as np


import pandas as pd

# In [3]:
# Build the ten-row demo `transactions` table by hand.
# The printed page cut the date list off mid-string; the two missing dates
# ('2012-08-26', '2013-06-06') are restored from rows 3 and 4 of the tables
# displayed further down, giving one date per TransactionID.
transactions = pd.DataFrame({
    'TransactionID': np.arange(10) + 1,
    # `.date` turns the parsed DatetimeIndex into plain datetime.date objects.
    'TransactionDate': pd.to_datetime([
        '2010-08-21', '2011-05-26', '2011-06-16', '2012-08-26', '2013-06-06',
        '2013-12-23', '2013-12-30', '2014-04-24', '2015-04-24', '2016-05-08',
    ]).date,
    # One user is unknown (np.nan), so this column cannot be integer-typed.
    'UserID': [7, 3, 3, 1, 2, 2, 3, np.nan, 7, 3],
    'ProductID': [2, 4, 3, 2, 4, 5, 4, 2, 4, 4],
    'Quantity': [1, 1, 1, 3, 1, 6, 1, 3, 3, 4]
})

In [4]: transactions = pd.read_csv('https://raw.githubusercontent.com/ben519/DataWra


# Print a structural summary of the frame: index range, per-column non-null
# counts and dtypes, and memory usage (see the text printed below).
In [5]: transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 TransactionID 10 non-null int64
1 TransactionDate 10 non-null object
2 UserID 9 non-null float64
3 ProductID 10 non-null int64
4 Quantity 10 non-null int64
dtypes: float64(1), int64(3), object(1)
memory usage: 532.0+ bytes

# In [6]:
# Number of rows in the table.
len(transactions)

Out[6]: 10

# In [7]:
# Number of columns in the table.
len(transactions.columns)

Out[7]: 5

# In [8]:
# Row labels as a NumPy array.
transactions.index.to_numpy()

Out[8]: array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64)

# In [9]:
# Column names as a NumPy array of strings (object dtype).
transactions.columns.to_numpy()

Out[9]: array(['TransactionID', 'TransactionDate', 'UserID', 'ProductID',


'Quantity'], dtype=object)

localhost:8888/notebooks/Data-Visualization-05-03.ipynb 1/17
3/8/24, 6:10 PM Data-Visualization-05-03 - Jupyter Notebook

# In [10]:
# Show the table with 'Quantity' relabelled as 'Quant'. The result is only
# displayed, not assigned, so `transactions` keeps its original column name.
transactions.rename(mapper={'Quantity': 'Quant'}, axis='columns')

Out[10]: TransactionID TransactionDate UserID ProductID Quant

0 1 2010-08-21 7.0 2 1

1 2 2011-05-26 3.0 4 1

2 3 2011-06-16 3.0 3 1

3 4 2012-08-26 1.0 2 3

4 5 2013-06-06 2.0 4 1

5 6 2013-12-23 2.0 5 6

6 7 2013-12-30 3.0 4 1

7 8 2014-04-24 NaN 2 3

8 9 2015-04-24 7.0 4 3

9 10 2016-05-08 3.0 4 4

# In [11]:
# Rows ordered by TransactionID, largest first.
transactions.sort_values(by=['TransactionID'], ascending=[False])

Out[11]: TransactionID TransactionDate UserID ProductID Quantity

9 10 2016-05-08 3.0 4 4

8 9 2015-04-24 7.0 4 3

7 8 2014-04-24 NaN 2 3

6 7 2013-12-30 3.0 4 1

5 6 2013-12-23 2.0 5 6

4 5 2013-06-06 2.0 4 1

3 4 2012-08-26 1.0 2 3

2 3 2011-06-16 3.0 3 1

1 2 2011-05-26 3.0 4 1

0 1 2010-08-21 7.0 2 1

localhost:8888/notebooks/Data-Visualization-05-03.ipynb 2/17
3/8/24, 6:10 PM Data-Visualization-05-03 - Jupyter Notebook

In [12]: transactions.sort_values(['Quantity', 'TransactionDate'], ascending=[True, F


Out[12]: TransactionID TransactionDate UserID ProductID Quantity

6 7 2013-12-30 3.0 4 1

4 5 2013-06-06 2.0 4 1

2 3 2011-06-16 3.0 3 1

1 2 2011-05-26 3.0 4 1

0 1 2010-08-21 7.0 2 1

8 9 2015-04-24 7.0 4 3

7 8 2014-04-24 NaN 2 3

3 4 2012-08-26 1.0 2 3

9 10 2016-05-08 3.0 4 4

5 6 2013-12-23 2.0 5 6

In [13]: transactions[['ProductID', 'Quantity', 'TransactionDate', 'TransactionID',


Out[13]: ProductID Quantity TransactionDate TransactionID UserID

0 2 1 2010-08-21 1 7.0

1 4 1 2011-05-26 2 3.0

2 3 1 2011-06-16 3 3.0

3 2 3 2012-08-26 4 1.0

4 4 1 2013-06-06 5 2.0

5 5 6 2013-12-23 6 2.0

6 4 1 2013-12-30 7 3.0

7 2 3 2014-04-24 8 NaN

8 4 3 2015-04-24 9 7.0

9 4 4 2016-05-08 10 3.0

localhost:8888/notebooks/Data-Visualization-05-03.ipynb 3/17
3/8/24, 6:10 PM Data-Visualization-05-03 - Jupyter Notebook

In [14]: transactions[pd.unique(['UserID'] + transactions.columns.values.tolist()).to


Out[14]: UserID TransactionID TransactionDate ProductID Quantity

0 7.0 1 2010-08-21 2 1

1 3.0 2 2011-05-26 4 1

2 3.0 3 2011-06-16 3 1

3 1.0 4 2012-08-26 2 3

4 2.0 5 2013-06-06 4 1

5 2.0 6 2013-12-23 5 6

6 3.0 7 2013-12-30 4 1

7 NaN 8 2014-04-24 2 3

8 7.0 9 2015-04-24 4 3

9 3.0 10 2016-05-08 4 4

# In [15]:
# First column as a NumPy array. The whole mixed-type frame is converted
# first, which is why the result comes back with dtype=object.
transactions.to_numpy()[:, 0]

Out[15]: array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=object)

# In [16]:
# Same extraction as the preceding cell: first column of the full frame
# converted to a NumPy array (dtype=object because the frame is mixed-type).
transactions.to_numpy()[:, 0]

Out[16]: array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=object)

# In [17]:
# Pull one column, chosen by a name held in a variable, out as a 1-D array.
# Selecting the single column before converting keeps its own integer dtype.
col = "ProductID"
transactions.loc[:, [col]].to_numpy()[:, 0]

Out[17]: array([2, 4, 3, 2, 4, 5, 4, 2, 4, 4], dtype=int64)

# In [18]:
# Rows at positions 0, 2 and 5.
transactions.take([0, 2, 5])

Out[18]: TransactionID TransactionDate UserID ProductID Quantity

0 1 2010-08-21 7.0 2 1

2 3 2011-06-16 3.0 3 1

5 6 2013-12-23 2.0 5 6

localhost:8888/notebooks/Data-Visualization-05-03.ipynb 4/17
3/8/24, 6:10 PM Data-Visualization-05-03 - Jupyter Notebook

# In [19]:
# Everything except the rows labelled 0, 2 and 5.
transactions.drop(index=[0, 2, 5])


Out[19]: TransactionID TransactionDate UserID ProductID Quantity

1 2 2011-05-26 3.0 4 1

3 4 2012-08-26 1.0 2 3

4 5 2013-06-06 2.0 4 1

6 7 2013-12-30 3.0 4 1

7 8 2014-04-24 NaN 2 3

8 9 2015-04-24 7.0 4 3

9 10 2016-05-08 3.0 4 4

# In [20]:
# Two equivalent spellings of "first three rows"; only the last expression
# in a cell is displayed.
transactions.iloc[:3]
transactions.head(n=3)

Out[20]: TransactionID TransactionDate UserID ProductID Quantity

0 1 2010-08-21 7.0 2 1

1 2 2011-05-26 3.0 4 1

2 3 2011-06-16 3.0 3 1

# In [21]:
# Two equivalent spellings of "all rows except the first three"; only the
# last expression in a cell is displayed.
transactions.iloc[3:]
transactions.tail(n=-3)

Out[21]: TransactionID TransactionDate UserID ProductID Quantity

3 4 2012-08-26 1.0 2 3

4 5 2013-06-06 2.0 4 1

5 6 2013-12-23 2.0 5 6

6 7 2013-12-30 3.0 4 1

7 8 2014-04-24 NaN 2 3

8 9 2015-04-24 7.0 4 3

9 10 2016-05-08 3.0 4 4

# In [22]:
# The last two rows.
transactions.iloc[-2:]

Out[22]: TransactionID TransactionDate UserID ProductID Quantity

8 9 2015-04-24 7.0 4 3

9 10 2016-05-08 3.0 4 4

localhost:8888/notebooks/Data-Visualization-05-03.ipynb 5/17
3/8/24, 6:10 PM Data-Visualization-05-03 - Jupyter Notebook

# In [23]:
# All rows except the first two.
transactions.iloc[2:]

Out[23]: TransactionID TransactionDate UserID ProductID Quantity

2 3 2011-06-16 3.0 3 1

3 4 2012-08-26 1.0 2 3

4 5 2013-06-06 2.0 4 1

5 6 2013-12-23 2.0 5 6

6 7 2013-12-30 3.0 4 1

7 8 2014-04-24 NaN 2 3

8 9 2015-04-24 7.0 4 3

9 10 2016-05-08 3.0 4 4

# In [24]:
# Transactions with more than one unit purchased.
transactions[transactions['Quantity'].gt(1)]


Out[24]: TransactionID TransactionDate UserID ProductID Quantity

3 4 2012-08-26 1.0 2 3

5 6 2013-12-23 2.0 5 6

7 8 2014-04-24 NaN 2 3

8 9 2015-04-24 7.0 4 3

9 10 2016-05-08 3.0 4 4

# In [25]:
# Transactions made by user 2.
transactions[transactions['UserID'].eq(2)]

Out[25]: TransactionID TransactionDate UserID ProductID Quantity

4 5 2013-06-06 2.0 4 1

5 6 2013-12-23 2.0 5 6

# In [26]:
# Transactions by user 2 with more than one unit purchased.
transactions.loc[transactions['Quantity'].gt(1) & transactions['UserID'].eq(2)]


Out[26]: TransactionID TransactionDate UserID ProductID Quantity

5 6 2013-12-23 2.0 5 6

localhost:8888/notebooks/Data-Visualization-05-03.ipynb 6/17
3/8/24, 6:10 PM Data-Visualization-05-03 - Jupyter Notebook

# In [27]:
# Rows where Quantity + UserID exceeds 3. A missing UserID makes the sum NaN,
# which never compares greater, so that row is left out.
transactions[transactions['Quantity'].add(transactions['UserID']).gt(3)]


Out[27]: TransactionID TransactionDate UserID ProductID Quantity

0 1 2010-08-21 7.0 2 1

1 2 2011-05-26 3.0 4 1

2 3 2011-06-16 3.0 3 1

3 4 2012-08-26 1.0 2 3

5 6 2013-12-23 2.0 5 6

6 7 2013-12-30 3.0 4 1

8 9 2015-04-24 7.0 4 3

9 10 2016-05-08 3.0 4 4

In [28]: foo = np.array([True, False, True, False, True, False, True, False, True, F
transactions[foo]

Out[28]: TransactionID TransactionDate UserID ProductID Quantity

0 1 2010-08-21 7.0 2 1

2 3 2011-06-16 3.0 3 1

4 5 2013-06-06 2.0 4 1

6 7 2013-12-30 3.0 4 1

8 9 2015-04-24 7.0 4 3

# In [30]:
# Filter rows with a mask derived from an external array: keep the rows whose
# matching `bar` entry is positive.
bar = np.array([1, -3, 2, 2, 0, -4, -4, 0, 0, 2])
transactions.loc[np.greater(bar, 0)]

Out[30]: TransactionID TransactionDate UserID ProductID Quantity

0 1 2010-08-21 7.0 2 1

2 3 2011-06-16 3.0 3 1

3 4 2012-08-26 1.0 2 3

9 10 2016-05-08 3.0 4 4

localhost:8888/notebooks/Data-Visualization-05-03.ipynb 7/17
3/8/24, 6:10 PM Data-Visualization-05-03 - Jupyter Notebook

# In [31]:
# Rows flagged by `foo`, or whose `bar` entry is negative.
transactions[np.logical_or(foo, bar < 0)]


Out[31]: TransactionID TransactionDate UserID ProductID Quantity

0 1 2010-08-21 7.0 2 1

1 2 2011-05-26 3.0 4 1

2 3 2011-06-16 3.0 3 1

4 5 2013-06-06 2.0 4 1

5 6 2013-12-23 2.0 5 6

6 7 2013-12-30 3.0 4 1

8 9 2015-04-24 7.0 4 3

# In [32]:
# Rows not flagged by `foo` whose `bar` entry is non-negative, written as the
# exact complement of "foo or bar negative" (De Morgan).
transactions[~(foo | (bar < 0))]


Out[32]: TransactionID TransactionDate UserID ProductID Quantity

3 4 2012-08-26 1.0 2 3

7 8 2014-04-24 NaN 2 3

9 10 2016-05-08 3.0 4 4

# In [33]:
# Columns at positions 0 and 2, all rows.
transactions.take([0, 2], axis=1)


Out[33]: TransactionID UserID

0 1 7.0

1 2 3.0

2 3 3.0

3 4 1.0

4 5 2.0

5 6 2.0

6 7 3.0

7 8 NaN

8 9 7.0

9 10 3.0

localhost:8888/notebooks/Data-Visualization-05-03.ipynb 8/17
3/8/24, 6:10 PM Data-Visualization-05-03 - Jupyter Notebook

# In [34]:
# Two columns selected by name, all rows.
transactions.loc[:, ['TransactionID', 'TransactionDate']]


Out[34]: TransactionID TransactionDate

0 1 2010-08-21

1 2 2011-05-26

2 3 2011-06-16

3 4 2012-08-26

4 5 2013-06-06

5 6 2013-12-23

6 7 2013-12-30

7 8 2014-04-24

8 9 2015-04-24

9 10 2016-05-08

In [36]: transactions.loc[transactions.TransactionID > 5, ['TransactionID', 'Transact


Out[36]: TransactionID TransactionDate

5 6 2013-12-23

6 7 2013-12-30

7 8 2014-04-24

8 9 2015-04-24

9 10 2016-05-08

# In [37]:
# Keep only the columns named in a list variable.
cols = ["TransactionID", "UserID", "Quantity"]
transactions.loc[:, cols]

Out[37]: TransactionID UserID Quantity

0 1 7.0 1

1 2 3.0 1

2 3 3.0 1

3 4 1.0 3

4 5 2.0 1

5 6 2.0 6

6 7 3.0 1

7 8 NaN 3

8 9 7.0 3

9 10 3.0 4

localhost:8888/notebooks/Data-Visualization-05-03.ipynb 9/17
3/8/24, 6:10 PM Data-Visualization-05-03 - Jupyter Notebook

# In [38]:
# The inverse selection: every column except those named in the list.
cols = ["TransactionID", "UserID", "Quantity"]
transactions.drop(columns=cols)

Out[38]: TransactionDate ProductID

0 2010-08-21 2

1 2011-05-26 4

2 2011-06-16 3

3 2012-08-26 2

4 2013-06-06 4

5 2013-12-23 5

6 2013-12-30 4

7 2014-04-24 2

8 2015-04-24 4

9 2016-05-08 4

In [39]: transactions['TransactionDate'] = pd.to_datetime(transactions.TransactionDat


# In [40]:
# New column Foo = UserID + ProductID (element-wise).
transactions['Foo'] = transactions['UserID'].add(transactions['ProductID'])


# In [41]:
# Blank out Foo on rows whose TransactionID is even.
transactions.loc[transactions['TransactionID'].mod(2).eq(0), 'Foo'] = np.nan

# In [42]:
# Overwrite an existing column in place: add 100 to every TransactionID, then
# subtract it again so the table ends up with its original IDs.
transactions['TransactionID'] = transactions['TransactionID'].add(100)
transactions['TransactionID'] = transactions['TransactionID'].sub(100)

# In [43]:
# New column RowIdx holding each row's 0-based position.
transactions['RowIdx'] = np.arange(len(transactions))


# In [44]:
# Three derived columns: each row's Quantity rank (ties share the average
# rank), plus the overall minimum and maximum Quantity repeated on every row.
transactions['QuantityRk'] = transactions['Quantity'].rank(method='average')
transactions['QuantityMin'] = transactions['Quantity'].min()
transactions['QuantityMax'] = transactions['Quantity'].max()

# In [45]:
# Remove the Foo column from `transactions` itself (in place, nothing returned).
transactions.drop(columns=['Foo'], inplace=True)


In [46]: transactions.drop(['QuantityRk', 'QuantityMin', 'QuantityMax'], axis=1, inpl


localhost:8888/notebooks/Data-Visualization-05-03.ipynb 10/17
3/8/24, 6:10 PM Data-Visualization-05-03 - Jupyter Notebook

In [47]: transactions.groupby('UserID').apply(lambda x: pd.Series(dict(Transactions=x

Out[47]: UserID Transactions

0 1.0 1

1 2.0 2

2 3.0 4

3 7.0 2

In [48]: transactions.groupby('UserID').apply(lambda x: pd.Series(dict(Transactions=x


 
Out[48]: UserID Transactions QuantityAvg

0 1.0 1.0 3.00

1 2.0 2.0 3.50

2 3.0 4.0 1.75

3 7.0 2.0 2.00

In [49]: users = pd.read_csv('https://raw.githubusercontent.com/ben519/DataWrangling/


sessions = pd.read_csv('https://raw.githubusercontent.com/ben519/DataWrangli
products = pd.read_csv('https://raw.githubusercontent.com/ben519/DataWrangli
transactions = pd.read_csv('https://raw.githubusercontent.com/ben519/DataWra

In [50]: users['Registered'] = pd.to_datetime(users.Registered)


users['Cancelled'] = pd.to_datetime(users.Cancelled)
transactions['TransactionDate'] = pd.to_datetime(transactions.TransactionDat

localhost:8888/notebooks/Data-Visualization-05-03.ipynb 11/17
3/8/24, 6:10 PM Data-Visualization-05-03 - Jupyter Notebook

# In [51]:
# Left join: keep every transaction and attach user details where the UserID
# is found in `users`; unmatched transactions get NaN/NaT in the user columns.
pd.merge(transactions, users, how='left', on='UserID')


Out[51]: TransactionID TransactionDate UserID ProductID Quantity User Gender Registered

0 1 2010-08-21 7.0 2 1 NaN NaN NaT

1 2 2011-05-26 3.0 4 1 Caroline female 2012-10-23

2 3 2011-06-16 3.0 3 1 Caroline female 2012-10-23

3 4 2012-08-26 1.0 2 3 Charles male 2012-12-21

4 5 2013-06-06 2.0 4 1 Pedro male 2010-08-01

5 6 2013-12-23 2.0 5 6 Pedro male 2010-08-01

6 7 2013-12-30 3.0 4 1 Caroline female 2012-10-23

7 8 2014-04-24 NaN 2 3 NaN NaN NaT

8 9 2015-04-24 7.0 4 3 NaN NaN NaT

9 10 2016-05-08 3.0 4 4 Caroline female 2012-10-23

 

# In [52]:
# Transactions whose UserID does not appear in `users` (this includes the
# row with a missing UserID).
transactions.loc[~transactions.UserID.isin(users.UserID)]

Out[52]: TransactionID TransactionDate UserID ProductID Quantity

0 1 2010-08-21 7.0 2 1

7 8 2014-04-24 NaN 2 3

8 9 2015-04-24 7.0 4 3

# In [53]:
# Inner join: only transactions whose UserID also exists in `users`.
pd.merge(transactions, users, how='inner', on='UserID')


Out[53]: TransactionID TransactionDate UserID ProductID Quantity User Gender Registered

0 2 2011-05-26 3.0 4 1 Caroline female 2012-10-23

1 3 2011-06-16 3.0 3 1 Caroline female 2012-10-23

2 7 2013-12-30 3.0 4 1 Caroline female 2012-10-23

3 10 2016-05-08 3.0 4 4 Caroline female 2012-10-23

4 4 2012-08-26 1.0 2 3 Charles male 2012-12-21

5 5 2013-06-06 2.0 4 1 Pedro male 2010-08-01

6 6 2013-12-23 2.0 5 6 Pedro male 2010-08-01

 

localhost:8888/notebooks/Data-Visualization-05-03.ipynb 12/17
3/8/24, 6:10 PM Data-Visualization-05-03 - Jupyter Notebook

# In [54]:
# Full outer join: every transaction and every user; users without any
# transaction appear with NaN/NaT in the transaction columns.
pd.merge(transactions, users, how='outer', on='UserID')

Out[54]: TransactionID TransactionDate UserID ProductID Quantity User Gender Registere

0 1.0 2010-08-21 7.0 2.0 1.0 NaN NaN Na

1 9.0 2015-04-24 7.0 4.0 3.0 NaN NaN Na

2 2.0 2011-05-26 3.0 4.0 1.0 Caroline female 2012-10-2

3 3.0 2011-06-16 3.0 3.0 1.0 Caroline female 2012-10-2

4 7.0 2013-12-30 3.0 4.0 1.0 Caroline female 2012-10-2

5 10.0 2016-05-08 3.0 4.0 4.0 Caroline female 2012-10-2

6 4.0 2012-08-26 1.0 2.0 3.0 Charles male 2012-12-2

7 5.0 2013-06-06 2.0 4.0 1.0 Pedro male 2010-08-0

8 6.0 2013-12-23 2.0 5.0 6.0 Pedro male 2010-08-0

9 8.0 2014-04-24 NaN 2.0 3.0 NaN NaN Na

10 NaN NaT 4.0 NaN NaN Brielle female 2013-07-1

11 NaN NaT 5.0 NaN NaN Benjamin male 2010-11-2

 

localhost:8888/notebooks/Data-Visualization-05-03.ipynb 13/17
3/8/24, 6:10 PM Data-Visualization-05-03 - Jupyter Notebook

In [55]: df1 = pd.DataFrame({'key': np.repeat(1, users.shape[0]), 'UserID': users.Use


df2 = pd.DataFrame({'key': np.repeat(1, products.shape[0]), 'ProductID': pro
pd.merge(df1, df2,on='key')[['UserID', 'ProductID']]

Out[55]: UserID ProductID

0 1 1

1 1 2

2 1 3

3 1 4

4 1 5

5 2 1

6 2 2

7 2 3

8 2 4

9 2 5

10 3 1

11 3 2

12 3 3

13 3 4

14 3 5

15 4 1

16 4 2

17 4 3

18 4 4

19 4 5

20 5 1

21 5 2

22 5 3

23 5 4

24 5 5

localhost:8888/notebooks/Data-Visualization-05-03.ipynb 14/17
3/8/24, 6:10 PM Data-Visualization-05-03 - Jupyter Notebook

In [56]: df2 = pd.DataFrame({'key': np.repeat(1, products.shape[0]), 'ProductID': pro


user_products = pd.merge(df1, df2,on='key')[['UserID', 'ProductID']]
pd.merge(user_products, transactions, how='left', on=['UserID', 'ProductID'

 
Out[56]: UserID ProductID Quantity

0 1 1 0.0

1 1 2 3.0

2 1 3 0.0

3 1 4 0.0

4 1 5 0.0

5 2 1 0.0

6 2 2 0.0

7 2 3 0.0

8 2 4 1.0

9 2 5 6.0

10 3 1 0.0

11 3 2 0.0

12 3 3 1.0

13 3 4 6.0

14 3 5 0.0

15 4 1 0.0

16 4 2 0.0

17 4 3 0.0

18 4 4 0.0

19 4 5 0.0

20 5 1 0.0

21 5 2 0.0

22 5 3 0.0

23 5 4 0.0

24 5 5 0.0

localhost:8888/notebooks/Data-Visualization-05-03.ipynb 15/17
3/8/24, 6:10 PM Data-Visualization-05-03 - Jupyter Notebook

# In [57]:
# Self-join on UserID: pairs every transaction with every transaction by the
# same user. Overlapping column names get _x / _y suffixes, and the row with
# a missing UserID pairs with itself.
transactions.merge(transactions, on='UserID')

Out[57]: TransactionID_x TransactionDate_x UserID ProductID_x Quantity_x TransactionID_y Tra

0 1 2010-08-21 7.0 2 1 1

1 1 2010-08-21 7.0 2 1 9

2 9 2015-04-24 7.0 4 3 1

3 9 2015-04-24 7.0 4 3 9

4 2 2011-05-26 3.0 4 1 2

5 2 2011-05-26 3.0 4 1 3

6 2 2011-05-26 3.0 4 1 7

7 2 2011-05-26 3.0 4 1 10

8 3 2011-06-16 3.0 3 1 2

9 3 2011-06-16 3.0 3 1 3

10 3 2011-06-16 3.0 3 1 7

11 3 2011-06-16 3.0 3 1 10

12 7 2013-12-30 3.0 4 1 2

13 7 2013-12-30 3.0 4 1 3

14 7 2013-12-30 3.0 4 1 7

15 7 2013-12-30 3.0 4 1 10

16 10 2016-05-08 3.0 4 4 2

17 10 2016-05-08 3.0 4 4 3

18 10 2016-05-08 3.0 4 4 7

19 10 2016-05-08 3.0 4 4 10

20 4 2012-08-26 1.0 2 3 4

21 5 2013-06-06 2.0 4 1 5

22 5 2013-06-06 2.0 4 1 6

23 6 2013-12-23 2.0 5 6 5

24 6 2013-12-23 2.0 5 6 6

25 8 2014-04-24 NaN 2 3 8

 

In [58]: pd.merge(users, transactions.groupby('UserID').first().reset_index(), how='l

Out[58]: UserID User Gender Registered Cancelled TransactionID TransactionDate ProductI

0 1 Charles male 2012-12-21 NaT 4.0 2012-08-26 2.

2010-08-
1 2 Pedro male 2010-08-01 5.0 2013-06-06 4.
08

2016-06-
2 3 Caroline female 2012-10-23 2.0 2011-05-26 4.
07

3 4 Brielle female 2013-07-17 NaT NaN NaT Na

4 5 Benjamin male 2010-11-25 NaT NaN NaT Na

 

localhost:8888/notebooks/Data-Visualization-05-03.ipynb 16/17
3/8/24, 6:10 PM Data-Visualization-05-03 - Jupyter Notebook

In [71]: users = pd.read_csv('https://raw.githubusercontent.com/ben519/DataWrangling/


transactions = pd.read_csv('https://raw.githubusercontent.com/ben519/DataWra

In [72]: users['Registered'] = pd.to_datetime(users.Registered)


users['Cancelled'] = pd.to_datetime(users.Cancelled)
transactions['TransactionDate'] = pd.to_datetime(transactions.TransactionDat

In [77]: transactions['TransactionWeekday'] = pd.Categorical(transactions.Transaction




 

# Display the full table, now including the TransactionWeekday column.
In [78]: transactions

Out[78]: TransactionID TransactionDate UserID ProductID Quantity TransactionWeekday

0 1 2010-08-21 7.0 2 1 Saturday

1 2 2011-05-26 3.0 4 1 Thursday

2 3 2011-06-16 3.0 3 1 Thursday

3 4 2012-08-26 1.0 2 3 Sunday

4 5 2013-06-06 2.0 4 1 Thursday

5 6 2013-12-23 2.0 5 6 Monday

6 7 2013-12-30 3.0 4 1 Monday

7 8 2014-04-24 NaN 2 3 Thursday

8 9 2015-04-24 7.0 4 3 Friday

9 10 2016-05-08 3.0 4 4 Sunday

In [ ]: ​

localhost:8888/notebooks/Data-Visualization-05-03.ipynb 17/17

You might also like