Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1of 308

Bar Plots, Histograms, and Distributions

# Scatter plots: casual and registered rentals against the working-day flag.
import pandas as pd
import matplotlib.pyplot as plt

bike_sharing = pd.read_csv('day.csv')
bike_sharing['dteday'] = pd.to_datetime(bike_sharing['dteday'])

# Casual riders vs. working day
plt.scatter(bike_sharing['workingday'], bike_sharing['casual'])
plt.title('Working Day Vs. Casual')
plt.show()

# Registered riders vs. working day
plt.scatter(bike_sharing['workingday'], bike_sharing['registered'])
plt.title('Working Day Vs. Registered')
plt.show()

2. Bar Plots
# Bar plot: average registered rentals on non-working vs. working days.
import matplotlib.pyplot as plt

working_days = ['Non-Working Day', 'Working Day']
registered_avg = [2959, 3978]

plt.bar(working_days, registered_avg)
plt.show()
3. Customizing Bar Plots
# Bar plot of average registered rentals per weekday, with named tick labels.
import pandas as pd
import matplotlib.pyplot as plt

bike_sharing = pd.read_csv('day.csv')
bike_sharing['dteday'] = pd.to_datetime(bike_sharing['dteday'])

# NOTE: the original comment here was split across two lines and the second
# line lacked a leading '#', which is a SyntaxError — rejoined as one comment.
# It's not essential to understand how this code works; it's covered in a later course.
weekday_averages = bike_sharing.groupby('weekday').mean()[['casual', 'registered']].reset_index()

plt.bar(weekday_averages['weekday'], weekday_averages['registered'])
plt.xticks(ticks=[0, 1, 2, 3, 4, 5, 6],
           labels=['Sunday', 'Monday', 'Tuesday', 'Wednesday',
                   'Thursday', 'Friday', 'Saturday'],
           rotation=30)
plt.show()

4. Frequency Tables
# Frequency bar plots of the four weather codes, one figure per year.
import matplotlib.pyplot as plt

unique_values = [1, 2, 3, 4]
weather_2011 = [226, 124, 15, 0]
weather_2012 = [237, 123, 6, 0]

plt.bar(unique_values, weather_2011)
plt.xticks(ticks=[1,2,3,4])
plt.title('Weather Patterns: 2011')
plt.ylabel('Frequency')
plt.xlabel('Unique Values')
plt.show()

plt.bar(unique_values, weather_2012)
plt.xticks(ticks=[1,2,3,4])
plt.title('Weather Patterns: 2012')
plt.ylabel('Frequency')
plt.xlabel('Unique Values')
plt.show()

5. Grouped Frequency Tables


# Grouped frequency tables: 10 equal-width intervals, ordered by interval.
import pandas as pd
import matplotlib.pyplot as plt

bike_sharing = pd.read_csv('day.csv')
bike_sharing['dteday'] = pd.to_datetime(bike_sharing['dteday'])

registered_freq = bike_sharing['registered'].value_counts(bins=10).sort_index()
casual_freq = bike_sharing['casual'].value_counts(bins=10).sort_index()
6. Histograms
# Histogram of casual rentals (Matplotlib's default number of bins).
import pandas as pd
import matplotlib.pyplot as plt

bike_sharing = pd.read_csv('day.csv')
bike_sharing['dteday'] = pd.to_datetime(bike_sharing['dteday'])

plt.hist(bike_sharing['casual'])
plt.show()
7. The Normal Distribution
# Quiz answers: statements about the normal distribution.
sentence_1 = True
sentence_2 = False
sentence_3 = True
sentence_4 = True
sentence_5 = False

8. The Uniform Distribution


# Quiz answers: statements about the uniform distribution.
sentence_1 = True
sentence_2 = False
sentence_3 = False
sentence_4 = False

9. Skewed Distributions
Bar Charts - Learn about this chart and tools to create it (datavizcatalogue.com)

Histogram - Learn about this chart and tools to create it (datavizcatalogue.com)

Frequency Distribution (mathsisfun.com)

Grouped Frequency Distribution (mathsisfun.com)

Normal Distribution (mathsisfun.com)

Pandas Visualizations and Grid Charts

1. Traffic Congestion in São Paulo


# First look at the São Paulo traffic dataset (semicolon-separated).
import pandas as pd

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic.head()
traffic.tail()
traffic.info()
3. Slowness in Traffic
# The slowness column uses a decimal comma — convert it to float, then plot.
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

plt.hist(traffic['Slowness in traffic (%)'])
plt.show()

# Quiz answers about the slowness distribution.
sentence_1 = True
sentence_2 = True
sentence_3 = False

4. Pandas Visualization Methods


# Same histogram, drawn through the pandas plotting interface instead.
import matplotlib.pyplot as plt
import pandas as pd

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

traffic['Slowness in traffic (%)'].plot.hist()
plt.title('Distribution of Slowness in traffic (%)')
plt.xlabel('Slowness in traffic (%)')
plt.show()
5. Frequency of Incidents
# Total count of each incident type, as a horizontal bar plot.
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

# Drop the non-incident columns, keeping only incident indicators.
incidents = traffic.drop(['Hour (Coded)', 'Slowness in traffic (%)'],
                         axis=1)
incidents.sum().plot.barh()
plt.show()

# Quiz answers about incident frequencies.
sentence_1 = False
sentence_2 = True
sentence_3 = True
6. Correlations with Traffic Slowness
# Scatter plots of traffic slowness against three incident types.
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

for incident in ('Lack of electricity', 'Point of flooding', 'Semaphore off'):
    traffic.plot.scatter(x='Slowness in traffic (%)', y=incident)
    plt.show()

7. Traffic Slowness Over 20%


# Incident frequencies restricted to rows with slowness of 20% or more.
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

slowness_20_or_more = traffic[traffic['Slowness in traffic (%)'] >= 20]
slowness_20_or_more = slowness_20_or_more.drop(['Slowness in traffic (%)',
                                                'Hour (Coded)'], axis=1)

incident_frequencies = slowness_20_or_more.sum()
incident_frequencies.plot.barh()
plt.show()
8. How Traffic Slowness Changes
# Split the data into five weekday slices (27 rows each) and draw one
# line plot per day. The original paste lost the for-loop indentation
# (bodies at column 0 → SyntaxError); restored here.
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
traffic_per_day = {}
for i, day in zip(range(0, 135, 27), days):
    each_day_traffic = traffic[i:i+27]
    traffic_per_day[day] = each_day_traffic

for day in days:
    traffic_per_day[day].plot.line(x='Hour (Coded)',
                                   y='Slowness in traffic (%)')
    plt.title(day)
    plt.ylim([0, 25])
    plt.show()
9. Comparing Graphs
# All five weekdays overlaid on one axes, distinguished by a legend.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
traffic_per_day = {}
for i, day in zip(range(0, 135, 27), days):
    each_day_traffic = traffic[i:i+27]
    traffic_per_day[day] = each_day_traffic

for day in days:
    plt.plot(traffic_per_day[day]['Hour (Coded)'],
             traffic_per_day[day]['Slowness in traffic (%)'],
             label=day)
plt.legend()
plt.show()

10. Grid Charts


# Demonstrate a 3x2 subplot grid (cell index is 1-based).
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

plt.figure()
plt.subplot(3, 2, 1)
plt.subplot(3, 2, 2)
plt.subplot(3, 2, 6)
plt.subplot(3, 2, 3)
plt.subplot(3, 2, 4)
plt.subplot(3, 2, 5)
plt.show()

11. Grid Charts (II)


# 3x2 grid chart: panels 1-5 each show one weekday's slowness curve.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
traffic_per_day = {}
for i, day in zip(range(0, 135, 27), days):
    each_day_traffic = traffic[i:i+27]
    traffic_per_day[day] = each_day_traffic

plt.figure(figsize=(10,12))
for i, day in zip(range(1,6), days):
    plt.subplot(3, 2, i)
    plt.plot(traffic_per_day[day]['Hour (Coded)'],
             traffic_per_day[day]['Slowness in traffic (%)'])
    plt.title(day)
    plt.ylim([0,25])
plt.show()

# Grid chart with a sixth panel overlaying all five days for comparison.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
traffic_per_day = {}
for i, day in zip(range(0, 135, 27), days):
    each_day_traffic = traffic[i:i+27]
    traffic_per_day[day] = each_day_traffic

plt.figure(figsize=(10,12))

# Panels 1-5: one weekday each.
for i, day in zip(range(1,6), days):
    plt.subplot(3, 2, i)
    plt.plot(traffic_per_day[day]['Hour (Coded)'],
             traffic_per_day[day]['Slowness in traffic (%)'])
    plt.title(day)
    plt.ylim([0,25])

# Panel 6: all five days overlaid.
plt.subplot(3, 2, 6)
for day in days:
    plt.plot(traffic_per_day[day]['Hour (Coded)'],
             traffic_per_day[day]['Slowness in traffic (%)'],
             label=day)
plt.ylim([0,25])
plt.legend()

plt.show()
Chart Visualization — pandas 1.4.1 documentation (pydata.org)

Small multiple - Wikipedia

Relational Plots and Multiple Variables


# First look at the housing dataset.
import pandas as pd

housing = pd.read_csv('housing.csv')
housing.head()
housing.tail()
housing.info()
2. Seaborn
# Basic Seaborn relational plot: living area vs. sale price.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

housing = pd.read_csv('housing.csv')

sns.set_theme()
sns.relplot(data=housing, x='Gr Liv Area', y='SalePrice')
plt.show()

# Larger living area goes with higher sale price.
correlation = 'positive'
# Same relplot, now encoding 'Overall Qual' as color (hue).
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

housing = pd.read_csv('housing.csv')

sns.set_theme()
sns.relplot(data=housing, x='Gr Liv Area', y='SalePrice',
            hue='Overall Qual', palette='RdYlGn')
plt.show()

# Quiz answers.
sentence_1 = True
sentence_2 = True
# Relplot with a fourth variable, 'Garage Area', encoded as marker size.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

housing = pd.read_csv('housing.csv')

sns.set_theme()
sns.relplot(data=housing, x='Gr Liv Area', y='SalePrice',
            hue='Overall Qual', palette='RdYlGn',
            size='Garage Area', sizes=(1,300))
plt.show()

# Quiz answers.
sentence_1 = False
sentence_2 = True
# Relplot with a fifth variable, 'Rooms', encoded as marker shape (style).
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

housing = pd.read_csv('housing.csv')

sns.set_theme()
sns.relplot(data=housing, x='Gr Liv Area', y='SalePrice',
            hue='Overall Qual', palette='RdYlGn',
            size='Garage Area', sizes=(1,300),
            style='Rooms')
plt.show()

# Quiz answers.
sentence_1 = False
sentence_2 = False

6.Variable Representation: Spatial Separation


# Relplot with a sixth variable, 'Year', encoded as spatial separation
# (one column of panels per year).
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

housing = pd.read_csv('housing.csv')

sns.set_theme()
sns.relplot(data=housing, x='Gr Liv Area', y='SalePrice',
            hue='Overall Qual', palette='RdYlGn',
            size='Garage Area', sizes=(1,300),
            style='Rooms', col='Year')
plt.show()

# Quiz answers.
sentence_1 = True
sentence_2 = True
Visualizing statistical relationships — seaborn 0.11.2 documentation (pydata.org)

Visualizing distributions of data — seaborn 0.11.2 documentation (pydata.org)

Plotting with categorical data — seaborn 0.11.2 documentation (pydata.org)

Building structured multi-plot grids — seaborn 0.11.2 documentation (pydata.org)

Guided Project: Finding Heavy Traffic Indicators on I-94

1. The I-94 Traffic Dataset


UCI Machine Learning Repository: Metro Interstate Traffic Volume Data Set

solutions/Mission524Solutions.ipynb at master · dataquestio/solutions · GitHub

2. Analyzing Traffic Volume


3. Traffic Volume: Day vs. Night
5. Time Indicators
8. Weather Indicators
9. Weather Types
Guided Project: Finding Heavy Traffic Indicators on I-94
# Horizontal bar plot of the COVID death toll per country (pyplot interface).
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')

plt.barh(top20_deathtoll['Country_Other'],
         top20_deathtoll['Total_Deaths'])
plt.show()
3. Matplotlib Interfaces
Data Visualization With Matplotlib Course | Dataquest
4. The OO Interface
# Same bar chart, built with the object-oriented (Figure/Axes) interface.
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')

fig, ax = plt.subplots()
ax.barh(top20_deathtoll['Country_Other'],
        top20_deathtoll['Total_Deaths'])
plt.show()

5. Mobile-Friendly Proportions
# Portrait (mobile-friendly) figure proportions: 4.5 x 6 inches.
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')

fig, ax = plt.subplots(figsize=(4.5, 6))
ax.barh(top20_deathtoll['Country_Other'],
        top20_deathtoll['Total_Deaths'])
plt.show()

6. Maximizing Data-Ink
7. Erasing Non-Data Ink
# Remove non-data ink: all four spines and the tick marks.
# Restores for-loop indentation lost in the original paste (SyntaxError).
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')

fig, ax = plt.subplots(figsize=(4.5, 6))
ax.barh(top20_deathtoll['Country_Other'],
        top20_deathtoll['Total_Deaths'])

for location in ['left', 'right', 'top', 'bottom']:
    ax.spines[location].set_visible(False)
ax.tick_params(bottom=False, left=False)

plt.show()

8. Erasing Redundant Data-Ink


# Reduce redundant data ink: thinner bars and only three x ticks.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')

fig, ax = plt.subplots(figsize=(4.5, 6))
ax.barh(top20_deathtoll['Country_Other'],
        top20_deathtoll['Total_Deaths'],
        height=0.45)

for location in ['left', 'right', 'top', 'bottom']:
    ax.spines[location].set_visible(False)
ax.tick_params(bottom=False, left=False)

# Three ticks are enough to read the magnitudes.
ax.set_xticks([0, 150000, 300000])

plt.show()

9. The Direction of Reading


# Move the x axis to the top and mute it, so readers hit the data first.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')

fig, ax = plt.subplots(figsize=(4.5, 6))
ax.barh(top20_deathtoll['Country_Other'],
        top20_deathtoll['Total_Deaths'],
        height=0.45, color='#af0b1e')

for location in ['left', 'right', 'top', 'bottom']:
    ax.spines[location].set_visible(False)

ax.set_xticks([0, 150000, 300000])
ax.xaxis.tick_top()                       # labels above the bars
ax.tick_params(top=False, left=False)
ax.tick_params(axis='x', colors='grey')   # de-emphasize the axis

plt.show()

10. Title and Subtitle


# Add a bold title and a plain subtitle above the chart with ax.text.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')

fig, ax = plt.subplots(figsize=(4.5, 6))
ax.barh(top20_deathtoll['Country_Other'],
        top20_deathtoll['Total_Deaths'],
        height=0.45, color='#af0b1e')

for location in ['left', 'right', 'top', 'bottom']:
    ax.spines[location].set_visible(False)

ax.set_xticks([0, 150000, 300000])
ax.xaxis.tick_top()
ax.tick_params(top=False, left=False)
ax.tick_params(axis='x', colors='grey')

# Title and subtitle, placed in data coordinates above the top bar.
ax.text(x=-80000, y=23.5,
        s='The Death Toll Worldwide Is 1.5M+',
        weight='bold', size=17)
ax.text(x=-80000, y=22.5,
        s='Top 20 countries by death toll (December 2020)',
        size=12)

plt.show()
# Final polish: formatted x labels, country names drawn as text at the left,
# and a faint vertical reference line at 150,000.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')

fig, ax = plt.subplots(figsize=(4.5, 6))
ax.barh(top20_deathtoll['Country_Other'],
        top20_deathtoll['Total_Deaths'],
        height=0.45, color='#af0b1e')

for location in ['left', 'right', 'top', 'bottom']:
    ax.spines[location].set_visible(False)

ax.set_xticks([0, 150000, 300000])
ax.xaxis.tick_top()
ax.tick_params(top=False, left=False)
ax.tick_params(axis='x', colors='grey')

ax.text(x=-80000, y=23.5,
        s='The Death Toll Worldwide Is 1.5M+',
        weight='bold', size=17)
ax.text(x=-80000, y=22.5,
        s='Top 20 countries by death toll (December 2020)',
        size=12)

ax.set_xticklabels(['0', '150,000', '300,000'])
ax.set_yticklabels([]) # an empty list removes the labels

# Re-draw country names as left-aligned text, one per bar.
country_names = top20_deathtoll['Country_Other']
for i, country in zip(range(20), country_names):
    ax.text(x=-80000, y=i-0.15, s=country)

ax.axvline(x=150000, ymin=0.045, c='grey',
           alpha=0.5)

plt.show()
The Visual Display of Quantitative Information by Edward R. Tufte (goodreads.com)

The Lifecycle of a Plot — Matplotlib 3.5.1 documentation (matplotlib.org)

Examples — Matplotlib 3.5.1 documentation (matplotlib.org)


Design for an Audience

1. Data Stories
2. Grid Charts in Matplotlib
# Four stacked line charts of the same monthly death series.
import pandas as pd
import matplotlib.pyplot as plt

death_toll = pd.read_csv('covid_avg_deaths.csv')

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
                                         figsize=(6,8))

ax1.plot(death_toll['Month'], death_toll['New_deaths'])
ax2.plot(death_toll['Month'], death_toll['New_deaths'])
ax3.plot(death_toll['Month'], death_toll['New_deaths'])
ax4.plot(death_toll['Month'], death_toll['New_deaths'])

plt.show()

3. Faster Workflow
# Style all four axes with one loop: plot, strip labels, ticks and spines.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

death_toll = pd.read_csv('covid_avg_deaths.csv')

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
                                         figsize=(6,8))
axes = [ax1, ax2, ax3, ax4]

for ax in axes:
    ax.plot(death_toll['Month'], death_toll['New_deaths'])
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(bottom=0, left=0)
    for location in ['left', 'right', 'top', 'bottom']:
        ax.spines[location].set_visible(False)

plt.show()

4. Modifying the Line Plots


# Faded full-year line on every panel, with one solid highlighted period each.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

death_toll = pd.read_csv('covid_avg_deaths.csv')

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
                                         figsize=(6,8))
axes = [ax1, ax2, ax3, ax4]

for ax in axes:
    ax.plot(death_toll['Month'], death_toll['New_deaths'],
            color='#af0b1e', alpha=0.1)
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(bottom=0, left=0)
    for location in ['left', 'right', 'top', 'bottom']:
        ax.spines[location].set_visible(False)

# One highlighted period per panel (slices overlap by one month on purpose).
ax1.plot(death_toll['Month'][:3], death_toll['New_deaths'][:3],
         color='#af0b1e', linewidth=2.5)
ax2.plot(death_toll['Month'][2:6], death_toll['New_deaths'][2:6],
         color='#af0b1e', linewidth=2.5)
ax3.plot(death_toll['Month'][5:10], death_toll['New_deaths'][5:10],
         color='#af0b1e', linewidth=2.5)
ax4.plot(death_toll['Month'][9:12], death_toll['New_deaths'][9:12],
         color='#af0b1e', linewidth=2.5)

plt.show()

5. Adding Structural Elements


# Add structural annotations: min/max value labels and a period label per panel.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

death_toll = pd.read_csv('covid_avg_deaths.csv')

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
                                         figsize=(6,8))
axes = [ax1, ax2, ax3, ax4]

for ax in axes:
    ax.plot(death_toll['Month'], death_toll['New_deaths'],
            color='#af0b1e', alpha=0.1)
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(bottom=0, left=0)
    for location in ['left', 'right', 'top', 'bottom']:
        ax.spines[location].set_visible(False)

ax1.plot(death_toll['Month'][:3], death_toll['New_deaths'][:3],
         color='#af0b1e', linewidth=2.5)
ax2.plot(death_toll['Month'][2:6], death_toll['New_deaths'][2:6],
         color='#af0b1e', linewidth=2.5)
ax3.plot(death_toll['Month'][5:10], death_toll['New_deaths'][5:10],
         color='#af0b1e', linewidth=2.5)
ax4.plot(death_toll['Month'][9:12], death_toll['New_deaths'][9:12],
         color='#af0b1e', linewidth=2.5)

# Value anchors on the first panel, then one period label per panel.
ax1.text(0.5, -80, '0', alpha=0.5)
ax1.text(3.5, 2000, '1,844', alpha=0.5)
ax1.text(11.5, 2400, '2,247', alpha=0.5)
ax1.text(1.1, -300, 'Jan - Mar', color='#af0b1e',
         weight='bold', rotation=3)
ax2.text(3.7, 800, 'Mar - Jun', color='#af0b1e', weight='bold')
ax3.text(7.1, 500, 'Jun - Oct', color='#af0b1e', weight='bold')
ax4.text(10.5, 600, 'Oct - Dec', color='#af0b1e', weight='bold',
         rotation=45)

plt.show()
6. Title and Subtitle
WHO Coronavirus (COVID-19) Dashboard | WHO Coronavirus (COVID-19) Dashboard With
Vaccination Data

# Full grid chart with title and subtitle on the top panel.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

death_toll = pd.read_csv('covid_avg_deaths.csv')

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
                                         figsize=(6,8))
axes = [ax1, ax2, ax3, ax4]

for ax in axes:
    ax.plot(death_toll['Month'], death_toll['New_deaths'],
            color='#af0b1e', alpha=0.1)
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(bottom=0, left=0)
    for location in ['left', 'right', 'top', 'bottom']:
        ax.spines[location].set_visible(False)

ax1.plot(death_toll['Month'][:3], death_toll['New_deaths'][:3],
         color='#af0b1e', linewidth=2.5)
ax1.text(0.5, -80, '0', alpha=0.5)
ax1.text(3.5, 2000, '1,844', alpha=0.5)
ax1.text(11.5, 2400, '2,247', alpha=0.5)
ax1.text(1.1, -300, 'Jan - Mar', color='#af0b1e',
         weight='bold', rotation=3)

ax2.plot(death_toll['Month'][2:6], death_toll['New_deaths'][2:6],
         color='#af0b1e', linewidth=2.5)
ax2.text(3.7, 800, 'Mar - Jun', color='#af0b1e', weight='bold')

ax3.plot(death_toll['Month'][5:10], death_toll['New_deaths'][5:10],
         color='#af0b1e', linewidth=2.5)
ax3.text(7.1, 500, 'Jun - Oct', color='#af0b1e', weight='bold')

ax4.plot(death_toll['Month'][9:12], death_toll['New_deaths'][9:12],
         color='#af0b1e', linewidth=2.5)
ax4.text(10.5, 600, 'Oct - Dec', color='#af0b1e', weight='bold',
         rotation=45)

# Title and subtitle on the top panel.
ax1.text(0.5, 3500, 'The virus kills 851 people each day',
         size=14, weight='bold')
ax1.text(0.5, 3150, 'Average number of daily deaths per month in the US',
         size=12)

plt.show()

7. Adding a Progress Bar


# Add a faint horizontal "progress bar" track (axes fraction 0.5-0.8) on
# every panel. Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

death_toll = pd.read_csv('covid_avg_deaths.csv')

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
                                         figsize=(6,8))
axes = [ax1, ax2, ax3, ax4]

for ax in axes:
    ax.plot(death_toll['Month'], death_toll['New_deaths'],
            color='#af0b1e', alpha=0.1)
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(bottom=0, left=0)
    for location in ['left', 'right', 'top', 'bottom']:
        ax.spines[location].set_visible(False)

ax1.plot(death_toll['Month'][:3], death_toll['New_deaths'][:3],
         color='#af0b1e', linewidth=2.5)
ax1.text(0.5, -80, '0', alpha=0.5)
ax1.text(3.5, 2000, '1,844', alpha=0.5)
ax1.text(11.5, 2400, '2,247', alpha=0.5)
ax1.text(1.1, -300, 'Jan - Mar', color='#af0b1e',
         weight='bold', rotation=3)
ax1.text(0.5, 3500, 'The virus kills 851 people each day',
         size=14, weight='bold')
ax1.text(0.5, 3150, 'Average number of daily deaths per month in the US',
         size=12)

ax2.plot(death_toll['Month'][2:6], death_toll['New_deaths'][2:6],
         color='#af0b1e', linewidth=2.5)
ax2.text(3.7, 800, 'Mar - Jun', color='#af0b1e', weight='bold')

ax3.plot(death_toll['Month'][5:10], death_toll['New_deaths'][5:10],
         color='#af0b1e', linewidth=2.5)
ax3.text(7.1, 500, 'Jun - Oct', color='#af0b1e', weight='bold')

ax4.plot(death_toll['Month'][9:12], death_toll['New_deaths'][9:12],
         color='#af0b1e', linewidth=2.5)
ax4.text(10.5, 600, 'Oct - Dec', color='#af0b1e', weight='bold',
         rotation=45)

# Progress-bar track; alternatively, integrate this into the styling loop above.
for ax in axes:
    ax.axhline(y=1600, xmin=0.5, xmax=0.8,
               linewidth=6, color='#af0b1e',
               alpha=0.1)

plt.show()

8. Completing the Progress Bar


# Complete the progress bars: fill each track proportionally to the
# cumulative death toll, and label it with the formatted number.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

death_toll = pd.read_csv('covid_avg_deaths.csv')

deaths = [2398, 126203, 227178, 295406]
# Fraction of the final toll reached by the end of each period.
proportions = [round(death/295406, 2) for death in deaths]
# Map each fraction onto the track's 0.5-0.8 axes-fraction span.
xmax_vals = [round(0.5 + proportion * 0.3, 3) for proportion in proportions]

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
                                         figsize=(6,8))
axes = [ax1, ax2, ax3, ax4]

for ax in axes:
    ax.plot(death_toll['Month'], death_toll['New_deaths'],
            color='#af0b1e', alpha=0.1)
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(bottom=0, left=0)
    for location in ['left', 'right', 'top', 'bottom']:
        ax.spines[location].set_visible(False)

ax1.plot(death_toll['Month'][:3], death_toll['New_deaths'][:3],
         color='#af0b1e', linewidth=2.5)
ax1.text(0.5, -80, '0', alpha=0.5)
ax1.text(3.5, 2000, '1,844', alpha=0.5)
ax1.text(11.5, 2400, '2,247', alpha=0.5)
ax1.text(1.1, -300, 'Jan - Mar', color='#af0b1e',
         weight='bold', rotation=3)
ax1.text(0.5, 3500, 'The virus kills 851 people each day',
         size=14, weight='bold')
ax1.text(0.5, 3150, 'Average number of daily deaths per month in the US',
         size=12)

ax2.plot(death_toll['Month'][2:6], death_toll['New_deaths'][2:6],
         color='#af0b1e', linewidth=2.5)
ax2.text(3.7, 800, 'Mar - Jun', color='#af0b1e', weight='bold')

ax3.plot(death_toll['Month'][5:10], death_toll['New_deaths'][5:10],
         color='#af0b1e', linewidth=2.5)
ax3.text(7.1, 500, 'Jun - Oct', color='#af0b1e', weight='bold')

ax4.plot(death_toll['Month'][9:12], death_toll['New_deaths'][9:12],
         color='#af0b1e', linewidth=2.5)
ax4.text(10.5, 600, 'Oct - Dec', color='#af0b1e', weight='bold',
         rotation=45)

# Faint track plus a solid fill up to each panel's proportion, labeled
# with the thousands-separated cumulative toll.
for ax, xmax, death in zip(axes, xmax_vals, deaths):
    ax.axhline(y=1600, xmin=0.5, xmax=0.8,
               linewidth=6, color='#af0b1e',
               alpha=0.1)
    ax.axhline(y=1600, xmin=0.5, xmax=xmax,
               linewidth=6, color='#af0b1e')
    ax.text(7.5, 1850, format(death, ','),
            color='#af0b1e', weight='bold')

plt.show()
The Visual Display of Quantitative Information by Edward R. Tufte (goodreads.com)

Examples — Matplotlib 3.5.1 documentation

Is Your Data Story Actually A Story? | by Joshua Smith | Nightingale | Medium

Gestalt Principles and Pre-Attentive Attributes

1. Gestalt Principles

Link:

Gestalt Principles And Pre-attentive Attributes — Gestalt Principles | Dataquest


2. Proximity
# Quiz answers: the proximity principle.
sentence_1 = True
sentence_2 = True
sentence_3 = False

3. Similarity
# Quiz answers: the similarity principle.
sentence_1 = False  # it's similarity of color, not shape
sentence_2 = True
sentence_3 = True

4. Enclosure
# Quiz answers: the enclosure principle.
sentence_1 = True
sentence_2 = True
sentence_3 = True

5. Connection
6. Visual Hierarchy
# Quiz answers: visual hierarchy.
sentence_1 = False  # enclosure is stronger
sentence_2 = True
sentence_3 = False  # enclosure is stronger
sentence_4 = False  # human perception is non-random

7. Pre-Attentive Attributes
# Quiz answers: pre-attentive attributes.
sentence_1 = True
sentence_2 = False
sentence_3 = True
Pre-attentive processing - Wikipedia

Gestalt psychology - Wikipedia

Matplotlib Styles: FiveThirtyEight Case Study


# Demonstrate switching Matplotlib styles ('ggplot' vs. 'default').
import matplotlib.style as style
import matplotlib.pyplot as plt  # fix: plt was used below but never imported

style.use('ggplot')
plt.plot([2, 4, 6], [10, 15, 5])
plt.show()

style.use('default')
plt.plot([2, 4, 6], [10, 15, 5])
plt.show()

2. Wine Quality Dataset


# Correlation of every wine property with quality (quality itself excluded).
import pandas as pd

red_wine = pd.read_csv('winequality-red.csv', sep=';')
red_corr = red_wine.corr()['quality'][:-1]

white_wine = pd.read_csv('winequality-white.csv', sep=';')
white_corr = white_wine.corr()['quality'][:-1]
print(white_corr)

3. FiveThirtyEight Style
# FiveThirtyEight-style back-to-back bar chart of wine-quality correlations.
# Uses white_corr/red_corr, style and plt from the preceding cells.
style.use('fivethirtyeight')

fig, ax = plt.subplots(figsize=(9, 5))
ax.barh(white_corr.index, white_corr, left=2, height=0.5)
ax.barh(red_corr.index, red_corr, height=0.5)

# fix: grid's 'b' parameter was deprecated in Matplotlib 3.5 and removed
# in 3.6 — the keyword is now 'visible'.
ax.grid(visible=False)
ax.set_yticklabels([])
ax.set_xticklabels([])

plt.show()

4. Adding Y-tick Labels


# Hand-placed y labels between the two bar groups, plus faint zero lines.
# Restores for-loop indentation lost in the original paste; also renames
# grid's removed 'b' keyword to 'visible' (Matplotlib >= 3.6).
style.use('fivethirtyeight')

fig, ax = plt.subplots(figsize=(9, 5))
ax.barh(white_corr.index, white_corr, left=2, height=0.5)
ax.barh(red_corr.index, red_corr, height=0.5, left=-0.1)

ax.grid(visible=False)
ax.set_yticklabels([])
ax.set_xticklabels([])

# One label per property, placed top to bottom between the groups.
x_coords = {'Alcohol': 0.82, 'Sulphates': 0.77, 'pH': 0.91,
            'Density': 0.80, 'Total Sulfur Dioxide': 0.59,
            'Free Sulfur Dioxide': 0.6, 'Chlorides': 0.77,
            'Residual Sugar': 0.67, 'Citric Acid': 0.76,
            'Volatile Acidity': 0.67, 'Fixed Acidity': 0.71}
y_coord = 9.8
for y_label, x_coord in x_coords.items():
    ax.text(x_coord, y_coord, y_label)
    y_coord -= 1

# Faint vertical reference (zero) lines, one per bar group.
ax.axvline(0.5, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)
ax.axvline(1.45, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)

plt.show()
5. Adding X-tick Labels
# Add custom x-axis baselines/labels and group headers (RED/WHITE WINE).
# Restores for-loop indentation lost in the original paste; also renames
# grid's removed 'b' keyword to 'visible' (Matplotlib >= 3.6).
style.use('fivethirtyeight')

fig, ax = plt.subplots(figsize=(9, 5))
ax.barh(white_corr.index, white_corr, left=2, height=0.5)
ax.barh(red_corr.index, red_corr, height=0.5, left=-0.1)

ax.grid(visible=False)
ax.set_yticklabels([])
ax.set_xticklabels([])

x_coords = {'Alcohol': 0.82, 'Sulphates': 0.77, 'pH': 0.91,
            'Density': 0.80, 'Total Sulfur Dioxide': 0.59,
            'Free Sulfur Dioxide': 0.6, 'Chlorides': 0.77,
            'Residual Sugar': 0.67, 'Citric Acid': 0.76,
            'Volatile Acidity': 0.67, 'Fixed Acidity': 0.71}
y_coord = 9.8
for y_label, x_coord in x_coords.items():
    ax.text(x_coord, y_coord, y_label)
    y_coord -= 1

ax.axvline(0.5, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)
ax.axvline(1.45, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)

# Bottom baselines with '-0.5 ... +0.5' range labels per group.
ax.axhline(-1, color='grey', linewidth=1, alpha=0.5,
           xmin=0.01, xmax=0.32)
ax.text(-0.7, -1.7, '-0.5'+ ' '*31 + '+0.5',
        color='grey', alpha=0.5)
ax.axhline(-1, color='grey', linewidth=1, alpha=0.5,
           xmin=0.67, xmax=0.98)
ax.text(1.43, -1.7, '-0.5'+ ' '*31 + '+0.5',
        color='grey', alpha=0.5)

# Group headers above each bar group.
ax.axhline(11, color='grey', linewidth=1, alpha=0.5,
           xmin=0.01, xmax=0.32)
ax.text(-0.33, 11.2, 'RED WINE', weight='bold')
ax.axhline(11, color='grey', linewidth=1, alpha=0.5,
           xmin=0.67, xmax=0.98)
ax.text(1.75, 11.2, 'WHITE WINE', weight='bold')

plt.show()

6. Adding a Signature

# Add the FiveThirtyEight-style signature bar, title and subtitle.
# Restores for-loop indentation lost in the original paste; also renames
# grid's removed 'b' keyword to 'visible' (Matplotlib >= 3.6).
style.use('fivethirtyeight')

fig, ax = plt.subplots(figsize=(9, 5))
ax.barh(white_corr.index, white_corr, left=2, height=0.5)
ax.barh(red_corr.index, red_corr, height=0.5, left=-0.1)

ax.grid(visible=False)
ax.set_yticklabels([])
ax.set_xticklabels([])

x_coords = {'Alcohol': 0.82, 'Sulphates': 0.77, 'pH': 0.91,
            'Density': 0.80, 'Total Sulfur Dioxide': 0.59,
            'Free Sulfur Dioxide': 0.6, 'Chlorides': 0.77,
            'Residual Sugar': 0.67, 'Citric Acid': 0.76,
            'Volatile Acidity': 0.67, 'Fixed Acidity': 0.71}
y_coord = 9.8
for y_label, x_coord in x_coords.items():
    ax.text(x_coord, y_coord, y_label)
    y_coord -= 1

ax.axvline(0.5, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)
ax.axvline(1.45, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)

ax.axhline(-1, color='grey', linewidth=1, alpha=0.5,
           xmin=0.01, xmax=0.32)
ax.text(-0.7, -1.7, '-0.5'+ ' '*31 + '+0.5',
        color='grey', alpha=0.5)
ax.axhline(-1, color='grey', linewidth=1, alpha=0.5,
           xmin=0.67, xmax=0.98)
ax.text(1.43, -1.7, '-0.5'+ ' '*31 + '+0.5',
        color='grey', alpha=0.5)

ax.axhline(11, color='grey', linewidth=1, alpha=0.5,
           xmin=0.01, xmax=0.32)
ax.text(-0.33, 11.2, 'RED WINE', weight='bold')
ax.axhline(11, color='grey', linewidth=1, alpha=0.5,
           xmin=0.67, xmax=0.98)
ax.text(1.75, 11.2, 'WHITE WINE', weight='bold')

# Signature bar: light text on a dark background strip.
ax.text(-0.7, -2.9,
        '©DATAQUEST' + ' '*94 + 'Source: P. Cortez et al.',
        color = '#f0f0f0', backgroundcolor = '#4d4d4d',
        size=12)

# Title and subtitle.
ax.text(-0.7, 13.5,
        'Wine Quality Most Strongly Correlated With Alcohol Level',
        size=17, weight='bold')
ax.text(-0.7, 12.7,
        'Correlation values between wine quality and wine properties (alcohol, pH, etc.)')

plt.show()

7. Coloring Bars Differently


# Color bars by correlation sign: blue for positive, orange for negative.
# Restores for-loop indentation lost in the original paste; also renames
# grid's removed 'b' keyword to 'visible' (Matplotlib >= 3.6).
positive_white = white_corr >= 0
color_map_white = positive_white.map({True:'#33A1C9', False:'#ffae42'})

style.use('fivethirtyeight')

fig, ax = plt.subplots(figsize=(9, 5))
ax.barh(white_corr.index, white_corr, left=2, height=0.5,
        color=color_map_white)

ax.grid(visible=False)
ax.set_yticklabels([])
ax.set_xticklabels([])

x_coords = {'Alcohol': 0.82, 'Sulphates': 0.77, 'pH': 0.91,
            'Density': 0.80, 'Total Sulfur Dioxide': 0.59,
            'Free Sulfur Dioxide': 0.6, 'Chlorides': 0.77,
            'Residual Sugar': 0.67, 'Citric Acid': 0.76,
            'Volatile Acidity': 0.67, 'Fixed Acidity': 0.71}
y_coord = 9.8
for y_label, x_coord in x_coords.items():
    ax.text(x_coord, y_coord, y_label)
    y_coord -= 1

ax.axvline(0.5, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)
ax.axvline(1.45, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)

ax.axhline(-1, color='grey', linewidth=1, alpha=0.5,
           xmin=0.01, xmax=0.32)
ax.text(-0.7, -1.7, '-0.5'+ ' '*31 + '+0.5',
        color='grey', alpha=0.5)
ax.axhline(-1, color='grey', linewidth=1, alpha=0.5,
           xmin=0.67, xmax=0.98)
ax.text(1.43, -1.7, '-0.5'+ ' '*31 + '+0.5',
        color='grey', alpha=0.5)

ax.axhline(11, color='grey', linewidth=1, alpha=0.5,
           xmin=0.01, xmax=0.32)
ax.text(-0.33, 11.2, 'RED WINE', weight='bold')
ax.axhline(11, color='grey', linewidth=1, alpha=0.5,
           xmin=0.67, xmax=0.98)
ax.text(1.75, 11.2, 'WHITE WINE', weight='bold')

ax.text(-0.7, -2.9, '©DATAQUEST' + ' '*92 + 'Source: P. Cortez et al.',
        color = '#f0f0f0', backgroundcolor = '#4d4d4d',
        size=12)

ax.text(-0.7, 13.5,
        'Wine Quality Most Strongly Correlated With Alcohol Level',
        size=17, weight='bold')
ax.text(-0.7, 12.7,
        'Correlation values between wine quality and wine properties (alcohol, pH, etc.)')

# Red-wine bars, drawn last with the same sign-based color mapping.
positive_red = red_corr >= 0
color_map_red = positive_red.map({True:'#33A1C9', False:'#ffae42'})
ax.barh(red_corr.index, red_corr, height=0.5, left=-0.1,
        color=color_map_red)

plt.show()
How to Generate FiveThirtyEight Graphs in Python – Dataquest

Guided Project: Storytelling Data Visualization on Exchange Rates


An In-Depth Style Guide for Data Science Projects – Dataquest

Latest 529 topics - Dataquest Community


Data Aggregation

World Happiness Report | Kaggle


# Aggregate the 2015 World Happiness data by region: first manually with a
# loop, then with the equivalent groupby / agg / pivot_table operations.
happiness2015 = pd.read_csv("World_Happiness_2015.csv")
first_5 = happiness2015.head()
happiness2015.info()

# Manual group-and-aggregate: mean happiness score per region.
mean_happiness = {}
regions = happiness2015['Region'].unique()
for r in regions:
    region_group = happiness2015[happiness2015['Region'] == r]
    region_mean = region_group['Happiness Score'].mean()
    mean_happiness[r] = region_mean

# Same grouping via the pandas GroupBy object.
grouped = happiness2015.groupby('Region')
aus_nz = grouped.get_group('Australia and New Zealand')

# get_group returns the same rows as boolean/positional selection.
grouped = happiness2015.groupby('Region')
north_america = happiness2015.iloc[[4, 14]]
na_group = grouped.get_group('North America')
equal = north_america == na_group

# Aggregate every column at once.
# NOTE(review): in pandas >= 2.0 this raises on non-numeric columns unless
# numeric_only=True is passed — confirm the pandas version in use.
grouped = happiness2015.groupby('Region')
means = grouped.mean()

# Aggregate a single column.
grouped = happiness2015.groupby('Region')
happy_grouped = grouped['Happiness Score']
happy_mean = happy_grouped.mean()

import numpy as np

grouped = happiness2015.groupby('Region')
happy_grouped = grouped['Happiness Score']

def dif(group):
    """Return the spread between a group's maximum and its mean."""
    return (group.max() - group.mean())

# agg accepts a list of functions or a custom callable.
happy_mean_max = happy_grouped.agg([np.mean, np.max])
mean_max_dif = happy_grouped.agg(dif)

# Shorthand: group, select, and aggregate in one chained expression.
happiness_means = happiness2015.groupby('Region')['Happiness Score'].mean()
print(happiness_means)

# pivot_table equivalent of the groupby above; margins=True adds an 'All' row
# holding the overall mean.
pv_happiness = happiness2015.pivot_table(values='Happiness Score', index='Region',
                                         aggfunc=np.mean, margins=True)
pv_happiness.plot(kind='barh', xlim=(0,10), title='Mean Happiness Scores by Region', legend=False)
world_mean_happiness = happiness2015['Happiness Score'].mean()

# Multiple columns and multiple aggregation functions at once.
grouped = happiness2015.groupby('Region')[['Happiness Score','Family']]
happy_family_stats = grouped.agg([np.min, np.max, np.mean])
pv_happy_family_stats = happiness2015.pivot_table(['Happiness Score', 'Family'], 'Region',
                                                  aggfunc=[np.min, np.max, np.mean],
                                                  margins=True)
Combining Data Using Pandas
import pandas as pd

# Load each year's World Happiness report and stamp every row with its year,
# so the three frames stay distinguishable after they are combined later.
happiness2015 = pd.read_csv("World_Happiness_2015.csv")
happiness2016 = pd.read_csv("World_Happiness_2016.csv")
happiness2017 = pd.read_csv("World_Happiness_2017.csv")

for report, report_year in ((happiness2015, 2015),
                            (happiness2016, 2016),
                            (happiness2017, 2017)):
    report['Year'] = report_year

Combining Data Using Pandas


Merge, join, concatenate and compare — pandas 1.4.1 documentation (pydata.org)

Transforming Data with Pandas

# Element-wise transformations on the happiness data: rename columns, then
# apply classification functions via map / apply / applymap, and finally
# reshape with melt for per-factor percentage analysis.
# NOTE(review): `happiness2015` is loaded earlier in the file.

# Shorten the verbose factor column names used throughout this section.
mapping = {'Economy (GDP per Capita)': 'Economy',
           'Health (Life Expectancy)': 'Health',
           'Trust (Government Corruption)': 'Trust'}
happiness2015 = happiness2015.rename(mapping, axis = 1)

def label(element):
    """Classify a factor contribution as 'High' (> 1) or 'Low'."""
    if element > 1:
        return 'High'
    else:
        return 'Low'

# Series.map and Series.apply are equivalent for element-wise transforms.
economy_impact_map = happiness2015['Economy'].map(label)
economy_impact_apply = happiness2015['Economy'].apply(label)
equal = economy_impact_map.equals(economy_impact_apply)

def label(element):
    """Classify a factor contribution as 'High' (> 1) or 'Low'."""
    if element > 1:
        return 'High'
    else:
        return 'Low'

economy_impact_apply = happiness2015['Economy'].apply(label)

def label(element, x):
    """Classify a factor contribution against a caller-supplied threshold x."""
    if element > x:
        return 'High'
    else:
        return 'Low'

# Unlike map, apply can forward extra keyword arguments to the function.
economy_impact_apply = happiness2015['Economy'].apply(label, x = .8)

def label(element):
    """Classify a factor contribution as 'High' (> 1) or 'Low'."""
    if element > 1:
        return 'High'
    else:
        return 'Low'

economy_apply = happiness2015['Economy'].apply(label)

# applymap runs the function on every element of a DataFrame.
# NOTE(review): applymap was renamed to DataFrame.map in pandas 2.1.
factors = ['Economy', 'Family', 'Health', 'Freedom', 'Trust', 'Generosity']
factors_impact = happiness2015[factors].applymap(label)

def v_counts(col):
    """Return the High/Low value counts of a column as proportions."""
    num = col.value_counts()
    den = col.size
    return num/den

# DataFrame.apply runs the function once per column (not per element).
v_counts_pct = factors_impact.apply(v_counts)

factors = ['Economy', 'Family', 'Health', 'Freedom', 'Trust', 'Generosity', 'Dystopia Residual']

def percentages(col):
    """Return each factor value as a percentage of the happiness score."""
    div = col/happiness2015['Happiness Score']
    return div * 100

factor_percentages = happiness2015[factors].apply(percentages)

# Reshape from wide to long: one row per (country, factor) pair.
main_cols = ['Country', 'Region', 'Happiness Rank', 'Happiness Score']
factors = ['Economy', 'Family', 'Health', 'Freedom', 'Trust', 'Generosity', 'Dystopia Residual']
melt = pd.melt(happiness2015, id_vars = main_cols, value_vars = factors)
melt['Percentage'] = round(melt['value']/melt['Happiness Score'] * 100, 2)

# Same melt with the column lists written inline.
melt = pd.melt(happiness2015, id_vars = ['Country', 'Region', 'Happiness Rank', 'Happiness Score'],
               value_vars= ['Economy', 'Family', 'Health', 'Freedom', 'Trust', 'Generosity', 'Dystopia Residual'])
melt['Percentage'] = round(melt['value']/melt['Happiness Score'] * 100, 2)

# Average contribution of each factor, visualized as a pie chart.
pv_melt = melt.pivot_table(index='variable', values='value')
pv_melt.plot(kind='pie', y='value', legend=False)


Tidy Data | Journal of Statistical Software (jstatsoft.org)

Working with Strings In Pandas


# Vectorized string operations: merge happiness data with World Bank
# development indicators, then clean and extract text with the .str accessor.
# NOTE(review): `happiness2015` and `plt` are defined earlier in the file.
world_dev = pd.read_csv("World_dev.csv")
col_renaming = {'SourceOfMostRecentIncomeAndExpenditureData': 'IESurvey'}
merged = pd.merge(left=happiness2015, right=world_dev, how='left', left_on='Country',
                  right_on='ShortName')
merged = merged.rename(col_renaming, axis=1)

def extract_last_word(element):
    """Return the last whitespace-separated word of element (as a string)."""
    return str(element).split()[-1]

# apply with a plain Python function vs. the vectorized .str equivalent.
merged['Currency Apply'] = merged['CurrencyUnit'].apply(extract_last_word)
merged['Currency Vectorized'] = merged['CurrencyUnit'].str.split().str.get(-1)
print(merged['Currency Vectorized'].head())

lengths = merged['CurrencyUnit'].str.len()
value_counts = lengths.value_counts(dropna=False)

# Rows whose notes mention "national accounts" (either capitalization).
pattern = r"[Nn]ational accounts"
national_accounts = merged['SpecialNotes'].str.contains(pattern)
print(national_accounts.head())

# na=False treats missing notes as non-matches, so the mask can index merged.
pattern = r"[Nn]ational accounts"
national_accounts = merged['SpecialNotes'].str.contains(r"[Nn]ational accounts", na=False)
merged_national_accounts = merged[national_accounts]
print(merged_national_accounts.head())

# Extract a four-digit year (1000-2999) from the notes.
pattern = r"([1-2][0-9]{3})"
years = merged['SpecialNotes'].str.extract(pattern)

# Named group + extractall: every year mentioned, not just the first.
pattern = r"(?P<Years>[1-2][0-9]{3})"
years = merged['IESurvey'].str.extractall(pattern)
value_counts = years['Years'].value_counts()
print(value_counts)

# Survey ranges like "2005/06": expand the two-digit second year to four
# digits by borrowing the century from the first year.
pattern = r"(?P<First_Year>[1-2][0-9]{3})/?(?P<Second_Year>[0-9]{2})?"
years = merged['IESurvey'].str.extractall(pattern)
first_two_year = years['First_Year'].str[0:2]
years['Second_Year'] = first_two_year + years['Second_Year']

# Normalize income-group labels ("Upper middle income:..." -> "UPPER MIDDLE")
# and plot mean happiness per income group.
merged['IncomeGroup'] = merged['IncomeGroup'].str.replace(' income', '').str.replace(':', '').str.upper()
pv_incomes = merged.pivot_table(values='Happiness Score', index='IncomeGroup')
pv_incomes.plot(kind='bar', rot=30, ylim=(0,10))
plt.show()
Working with text data — pandas 1.4.1 documentation (pydata.org)

6.2. re — Regular expression operations — Python 3.4.10 documentation

Working With Missing And Duplicate Data

# Handle missing and duplicate values: normalize column names across the
# three yearly reports, combine them, then deduplicate and fill gaps.
# NOTE(review): `happiness2015/2016/2017` and `regions` are defined earlier.
shape_2015 = happiness2015.shape
shape_2016 = happiness2016.shape
shape_2017 = happiness2017.shape

missing_2016 = happiness2016.isnull().sum()
missing_2017 = happiness2017.isnull().sum()

# Normalize 2017 names: dots to spaces, collapse runs of whitespace, uppercase.
# regex= is passed explicitly: '.', '(' and ')' are regex metacharacters, and
# the original calls relied on the pre-2.0 pandas default (regex=True), under
# which '.' matches *every* character and '(' is a regex error.
happiness2017.columns = (happiness2017.columns
                         .str.replace('.', ' ', regex=False)
                         .str.replace(r'\s+', ' ', regex=True)
                         .str.strip().str.upper())
# Strip parentheses from the 2015/2016 names, then uppercase.
happiness2015.columns = (happiness2015.columns
                         .str.replace('(', '', regex=False)
                         .str.replace(')', '', regex=False)
                         .str.strip().str.upper())
happiness2016.columns = (happiness2016.columns
                         .str.replace('(', '', regex=False)
                         .str.replace(')', '', regex=False)
                         .str.strip().str.upper())

combined = pd.concat([happiness2015, happiness2016, happiness2017], ignore_index=True)
missing = combined.isnull().sum()

# The 2017 report has no region column, so those rows are all missing.
regions_2017 = combined[combined['YEAR']==2017]['REGION']
missing = regions_2017.isnull().sum()

# Fill regions from a complete country->region lookup, then drop the
# partially-missing original column left behind by the merge.
combined = pd.merge(left=combined, right=regions, on='COUNTRY', how='left')
combined = combined.drop('REGION_x', axis = 1)
missing = combined.isnull().sum()

# Uppercase country names so duplicates differing only in case match.
combined['COUNTRY'] = combined['COUNTRY'].str.upper()
dups = combined.duplicated(['COUNTRY', 'YEAR'])
print(combined[dups])

combined['COUNTRY'] = combined['COUNTRY'].str.upper()
combined = combined.drop_duplicates(['COUNTRY', 'YEAR'])

# Drop columns that exist in only some of the yearly reports.
columns_to_drop = ['LOWER CONFIDENCE INTERVAL', 'STANDARD ERROR',
                   'UPPER CONFIDENCE INTERVAL', 'WHISKER HIGH', 'WHISKER LOW']
combined = combined.drop(columns_to_drop, axis = 1)
missing = combined.isnull().sum()

# Keep only columns with at least 159 non-null values.
combined = combined.dropna(thresh=159, axis=1)
missing = combined.isnull().sum()

# Impute missing happiness scores with the overall mean (leaves the mean
# unchanged, as the second print confirms).
happiness_mean = combined['HAPPINESS SCORE'].mean()
print(happiness_mean)
combined['HAPPINESS SCORE UPDATED'] = combined['HAPPINESS SCORE'].fillna(happiness_mean)
print(combined['HAPPINESS SCORE UPDATED'].mean())

# Finally, drop any rows that still contain missing values.
combined = combined.dropna()
missing = combined.isnull().sum()
Working with missing data — pandas 1.4.1 documentation (pydata.org)
Regular Expression Basics

import re

# Count Hacker News story titles that mention Python in either capitalization.
# NOTE(review): `hn` (the hacker_news DataFrame) is loaded earlier in the file.
titles = hn["title"].tolist()

python_mentions = 0
pattern = "[Pp]ython"  # character class covers "Python" and "python"
for t in titles:
    if re.search(pattern, t):
        python_mentions += 1
re — Regular expression operations — Python 3.10.2 documentation

RegExr: Learn, Build, & Test RegEx

Advanced Regular Expressions

import pandas as pd

import re

# Advanced regular expressions on Hacker News titles and URLs: flags,
# capture groups, lookarounds, backreferences, and named groups.
hn = pd.read_csv("hacker_news.csv")
titles = hn['title']

# Case-insensitive count of titles containing "SQL".
sql_pattern = r"SQL"
sql_counts = titles.str.contains(sql_pattern, flags=re.I).sum()

# Titles mentioning a SQL flavor (MySQL, PostgreSQL, ...): capture the flavor
# and compare average comment counts per flavor.
hn_sql = hn[hn['title'].str.contains(r"\w+SQL", flags=re.I)].copy()
hn_sql["flavor"] = hn_sql["title"].str.extract(r"(\w+SQL)", re.I, expand=False)
hn_sql["flavor"] = hn_sql["flavor"].str.lower()
sql_pivot = hn_sql.pivot_table(index="flavor",values="num_comments", aggfunc='mean')

# Python version numbers mentioned in titles, e.g. "Python 3.6".
pattern = r"[Pp]ython ([\d\.]+)"
py_versions = titles.str.extract(pattern, expand=False)
py_versions_freq = dict(py_versions.value_counts())

def first_10_matches(pattern):
    """
    Return the first 10 story titles that match
    the provided regular expression
    """
    all_matches = titles[titles.str.contains(pattern)]
    first_10 = all_matches.head(10)
    return first_10

# Match the language "C" as a standalone word, excluding "C." and "C+".
# pattern = r"\b[Cc]\b"
pattern = r"\b[Cc]\b[^.+]"
first_ten = first_10_matches(pattern)

# Lookbehind/lookahead refinement: also exclude "Series C", but allow a
# sentence-final "C.".
pattern = r"(?<!Series\s)\b[Cc]\b((?![+.])|\.$)"
c_mentions = titles.str.contains(pattern).sum()

# Backreference \1: titles containing a repeated word ("the the").
pattern = r"\b(\w+)\s\1\b"
repeated_words = titles[titles.str.contains(pattern)]

# Normalize the many spellings of "email" to a single form.
email_variations = pd.Series(['email', 'Email', 'e Mail',
                              'e mail', 'E-mail', 'e-mail',
                              'eMail', 'E-Mail', 'EMAIL'])
pattern = r"\be[-\s]?mail"
email_uniform = email_variations.str.replace(pattern, "email", flags=re.I)
titles_clean = titles.str.replace(pattern, "email", flags=re.I)

# Sample URLs covering the edge cases the domain pattern must handle.
test_urls = pd.Series([
    'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429',
    'http://www.interactivedynamicvideo.com/',
    'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0',
    'http://evonomics.com/advertising-cannot-maintain-internet-heres-solution/',
    'HTTPS://github.com/keppel/pinn',
    'Http://phys.org/news/2015-09-scale-solar-youve.html',
    'https://iot.seeed.cc',
    'http://www.bfilipek.com/2016/04/custom-deleters-for-c-smart-pointers.html',
    'http://beta.crowdfireapp.com/?beta=agnipath',
    'https://www.valid.ly?param',
    'http://css-cursor.techstream.org'
])

# Capture just the domain, then rank the five most common domains.
pattern = r"https?://([\w\-\.]+)"
test_urls_clean = test_urls.str.extract(pattern, flags=re.I, expand=False)
domains = hn['url'].str.extract(pattern, flags=re.I, expand=False)
top_domains = domains.value_counts().head(5)

# Three capture groups split each URL into protocol / domain / path.
pattern = r"(https?)://([\w\.\-]+)/?(.*)"
test_url_parts = test_urls.str.extract(pattern, flags=re.I)
url_parts = hn['url'].str.extract(pattern, flags=re.I)

# Named groups give the resulting DataFrame meaningful column names.
pattern = r"(?P<protocol>https?)://(?P<domain>[\w\.\-]+)/?(?P<path>.*)"
url_parts = hn['url'].str.extract(pattern, flags=re.I)


re — Regular expression operations — Python 3.10.2 documentation

RegExr: Learn, Build, & Test RegEx


List Comprehensions and Lambda Functions
# JSON text describing the final two games of the 2018 World Cup.
# NOTE(review): the scraped version of this string had lost its '[' and '{'
# delimiters, which made json.loads raise JSONDecodeError; they are restored
# here to form a valid JSON array of two objects.
world_cup_str = """
[
    {
        "team_1": "France",
        "team_2": "Croatia",
        "game_type": "Final",
        "score" : [4, 2]
    },
    {
        "team_1": "Belgium",
        "team_2": "England",
        "game_type": "3rd/4th Playoff",
        "score" : [2, 0]
    }
]
"""

import json

# Parse the JSON string into a Python list of dicts.
world_cup_obj = json.loads(world_cup_str)
JSON

json — JSON encoder and decoder — Python 3.7.12 documentation

5. Data Structures — Python 3.10.2 documentation

4. More Control Flow Tools — Python 3.10.2 documentation

Working with Missing Data

You might also like