Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1of 308

Bar Plots, Histograms, and Distributions

# Scatter plots: casual and registered rentals against the working-day flag.
import pandas as pd
import matplotlib.pyplot as plt

bike_sharing = pd.read_csv('day.csv')
bike_sharing['dteday'] = pd.to_datetime(bike_sharing['dteday'])

# Casual riders vs. working day
plt.scatter(bike_sharing['workingday'], bike_sharing['casual'])
plt.title('Working Day Vs. Casual')
plt.show()

# Registered riders vs. working day
plt.scatter(bike_sharing['workingday'], bike_sharing['registered'])
plt.title('Working Day Vs. Registered')
plt.show()

2. Bar Plots
# Bar plot: average registered rentals on non-working vs. working days.
import matplotlib.pyplot as plt

working_days = ['Non-Working Day', 'Working Day']
registered_avg = [2959, 3978]

plt.bar(working_days, registered_avg)
plt.show()
3. Customizing Bar Plots
# Bar plot of average registered rentals per weekday, with named tick labels.
import pandas as pd
import matplotlib.pyplot as plt

bike_sharing = pd.read_csv('day.csv')
bike_sharing['dteday'] = pd.to_datetime(bike_sharing['dteday'])

# NOTE: the original comment here was split across two lines and the second
# line lacked a leading '#', which is a SyntaxError — rejoined as one comment.
# It's not essential to understand how this code works; it's covered in a later course.
weekday_averages = bike_sharing.groupby('weekday').mean()[['casual', 'registered']].reset_index()

plt.bar(weekday_averages['weekday'], weekday_averages['registered'])
plt.xticks(ticks=[0, 1, 2, 3, 4, 5, 6],
           labels=['Sunday', 'Monday', 'Tuesday', 'Wednesday',
                   'Thursday', 'Friday', 'Saturday'],
           rotation=30)
plt.show()

4. Frequency Tables
# Frequency bar plots of the four weather codes, one figure per year.
import matplotlib.pyplot as plt

unique_values = [1, 2, 3, 4]
weather_2011 = [226, 124, 15, 0]
weather_2012 = [237, 123, 6, 0]

plt.bar(unique_values, weather_2011)
plt.xticks(ticks=[1,2,3,4])
plt.title('Weather Patterns: 2011')
plt.ylabel('Frequency')
plt.xlabel('Unique Values')
plt.show()

plt.bar(unique_values, weather_2012)
plt.xticks(ticks=[1,2,3,4])
plt.title('Weather Patterns: 2012')
plt.ylabel('Frequency')
plt.xlabel('Unique Values')
plt.show()

5. Grouped Frequency Tables


# Grouped frequency tables: 10 equal-width intervals, ordered by interval.
import pandas as pd
import matplotlib.pyplot as plt

bike_sharing = pd.read_csv('day.csv')
bike_sharing['dteday'] = pd.to_datetime(bike_sharing['dteday'])

registered_freq = bike_sharing['registered'].value_counts(bins=10).sort_index()
casual_freq = bike_sharing['casual'].value_counts(bins=10).sort_index()
6. Histograms
# Histogram of casual rentals (Matplotlib's default number of bins).
import pandas as pd
import matplotlib.pyplot as plt

bike_sharing = pd.read_csv('day.csv')
bike_sharing['dteday'] = pd.to_datetime(bike_sharing['dteday'])

plt.hist(bike_sharing['casual'])
plt.show()
7. The Normal Distribution
# Quiz answers: statements about the normal distribution.
sentence_1 = True
sentence_2 = False
sentence_3 = True
sentence_4 = True
sentence_5 = False

8. The Uniform Distribution


# Quiz answers: statements about the uniform distribution.
sentence_1 = True
sentence_2 = False
sentence_3 = False
sentence_4 = False

9. Skewed Distributions
Bar Charts - Learn about this chart and tools to create it (datavizcatalogue.com)

Histogram - Learn about this chart and tools to create it (datavizcatalogue.com)

Frequency Distribution (mathsisfun.com)

Grouped Frequency Distribution (mathsisfun.com)

Normal Distribution (mathsisfun.com)

Pandas Visualizations and Grid Charts

1. Traffic Congestion in São Paulo


# First look at the São Paulo traffic dataset (semicolon-separated).
import pandas as pd

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic.head()
traffic.tail()
traffic.info()
3. Slowness in Traffic
# The slowness column uses a decimal comma — convert it to float, then plot.
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

plt.hist(traffic['Slowness in traffic (%)'])
plt.show()

# Quiz answers about the slowness distribution.
sentence_1 = True
sentence_2 = True
sentence_3 = False

4. Pandas Visualization Methods


# Same histogram, drawn through the pandas plotting interface instead.
import matplotlib.pyplot as plt
import pandas as pd

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

traffic['Slowness in traffic (%)'].plot.hist()
plt.title('Distribution of Slowness in traffic (%)')
plt.xlabel('Slowness in traffic (%)')
plt.show()
5. Frequency of Incidents
# Total count of each incident type, as a horizontal bar plot.
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

# Drop the non-incident columns, keeping only incident indicators.
incidents = traffic.drop(['Hour (Coded)', 'Slowness in traffic (%)'],
                         axis=1)
incidents.sum().plot.barh()
plt.show()

# Quiz answers about incident frequencies.
sentence_1 = False
sentence_2 = True
sentence_3 = True
6. Correlations with Traffic Slowness
# Scatter plots of traffic slowness against three incident types.
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

for incident in ('Lack of electricity', 'Point of flooding', 'Semaphore off'):
    traffic.plot.scatter(x='Slowness in traffic (%)', y=incident)
    plt.show()

7. Traffic Slowness Over 20%


# Incident frequencies restricted to rows with slowness of 20% or more.
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

slowness_20_or_more = traffic[traffic['Slowness in traffic (%)'] >= 20]
slowness_20_or_more = slowness_20_or_more.drop(['Slowness in traffic (%)',
                                                'Hour (Coded)'], axis=1)

incident_frequencies = slowness_20_or_more.sum()
incident_frequencies.plot.barh()
plt.show()
8. How Traffic Slowness Changes
# Split the data into five weekday slices (27 rows each) and draw one
# line plot per day. The original paste lost the for-loop indentation
# (bodies at column 0 → SyntaxError); restored here.
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
traffic_per_day = {}
for i, day in zip(range(0, 135, 27), days):
    each_day_traffic = traffic[i:i+27]
    traffic_per_day[day] = each_day_traffic

for day in days:
    traffic_per_day[day].plot.line(x='Hour (Coded)',
                                   y='Slowness in traffic (%)')
    plt.title(day)
    plt.ylim([0, 25])
    plt.show()
9. Comparing Graphs
# All five weekdays overlaid on one axes, distinguished by a legend.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
traffic_per_day = {}
for i, day in zip(range(0, 135, 27), days):
    each_day_traffic = traffic[i:i+27]
    traffic_per_day[day] = each_day_traffic

for day in days:
    plt.plot(traffic_per_day[day]['Hour (Coded)'],
             traffic_per_day[day]['Slowness in traffic (%)'],
             label=day)
plt.legend()
plt.show()

10. Grid Charts


# Demonstrate a 3x2 subplot grid (cell index is 1-based).
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

plt.figure()
plt.subplot(3, 2, 1)
plt.subplot(3, 2, 2)
plt.subplot(3, 2, 6)
plt.subplot(3, 2, 3)
plt.subplot(3, 2, 4)
plt.subplot(3, 2, 5)
plt.show()

11. Grid Charts (II)


# 3x2 grid chart: panels 1-5 each show one weekday's slowness curve.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
traffic_per_day = {}
for i, day in zip(range(0, 135, 27), days):
    each_day_traffic = traffic[i:i+27]
    traffic_per_day[day] = each_day_traffic

plt.figure(figsize=(10,12))
for i, day in zip(range(1,6), days):
    plt.subplot(3, 2, i)
    plt.plot(traffic_per_day[day]['Hour (Coded)'],
             traffic_per_day[day]['Slowness in traffic (%)'])
    plt.title(day)
    plt.ylim([0,25])
plt.show()

# Grid chart with a sixth panel overlaying all five days for comparison.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

traffic = pd.read_csv('traffic_sao_paulo.csv', sep=';')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].str.replace(',', '.')
traffic['Slowness in traffic (%)'] = traffic['Slowness in traffic (%)'].astype(float)

days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
traffic_per_day = {}
for i, day in zip(range(0, 135, 27), days):
    each_day_traffic = traffic[i:i+27]
    traffic_per_day[day] = each_day_traffic

plt.figure(figsize=(10,12))

# Panels 1-5: one weekday each.
for i, day in zip(range(1,6), days):
    plt.subplot(3, 2, i)
    plt.plot(traffic_per_day[day]['Hour (Coded)'],
             traffic_per_day[day]['Slowness in traffic (%)'])
    plt.title(day)
    plt.ylim([0,25])

# Panel 6: all five days overlaid.
plt.subplot(3, 2, 6)
for day in days:
    plt.plot(traffic_per_day[day]['Hour (Coded)'],
             traffic_per_day[day]['Slowness in traffic (%)'],
             label=day)
plt.ylim([0,25])
plt.legend()

plt.show()
Chart Visualization — pandas 1.4.1 documentation (pydata.org)

Small multiple - Wikipedia

Relational Plots and Multiple Variables


# First look at the housing dataset.
import pandas as pd

housing = pd.read_csv('housing.csv')
housing.head()
housing.tail()
housing.info()
2. Seaborn
# Basic Seaborn relational plot: living area vs. sale price.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

housing = pd.read_csv('housing.csv')

sns.set_theme()
sns.relplot(data=housing, x='Gr Liv Area', y='SalePrice')
plt.show()

# Larger living area goes with higher sale price.
correlation = 'positive'
# Same relplot, now encoding 'Overall Qual' as color (hue).
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

housing = pd.read_csv('housing.csv')

sns.set_theme()
sns.relplot(data=housing, x='Gr Liv Area', y='SalePrice',
            hue='Overall Qual', palette='RdYlGn')
plt.show()

# Quiz answers.
sentence_1 = True
sentence_2 = True
# Relplot with a fourth variable, 'Garage Area', encoded as marker size.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

housing = pd.read_csv('housing.csv')

sns.set_theme()
sns.relplot(data=housing, x='Gr Liv Area', y='SalePrice',
            hue='Overall Qual', palette='RdYlGn',
            size='Garage Area', sizes=(1,300))
plt.show()

# Quiz answers.
sentence_1 = False
sentence_2 = True
# Relplot with a fifth variable, 'Rooms', encoded as marker shape (style).
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

housing = pd.read_csv('housing.csv')

sns.set_theme()
sns.relplot(data=housing, x='Gr Liv Area', y='SalePrice',
            hue='Overall Qual', palette='RdYlGn',
            size='Garage Area', sizes=(1,300),
            style='Rooms')
plt.show()

# Quiz answers.
sentence_1 = False
sentence_2 = False

6.Variable Representation: Spatial Separation


# Relplot with a sixth variable, 'Year', encoded as spatial separation
# (one column of panels per year).
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

housing = pd.read_csv('housing.csv')

sns.set_theme()
sns.relplot(data=housing, x='Gr Liv Area', y='SalePrice',
            hue='Overall Qual', palette='RdYlGn',
            size='Garage Area', sizes=(1,300),
            style='Rooms', col='Year')
plt.show()

# Quiz answers.
sentence_1 = True
sentence_2 = True
Visualizing statistical relationships — seaborn 0.11.2 documentation (pydata.org)

Visualizing distributions of data — seaborn 0.11.2 documentation (pydata.org)

Plotting with categorical data — seaborn 0.11.2 documentation (pydata.org)

Building structured multi-plot grids — seaborn 0.11.2 documentation (pydata.org)

Guided Project: Finding Heavy Traffic Indicators on I-94

1. The I-94 Traffic Dataset


UCI Machine Learning Repository: Metro Interstate Traffic Volume Data Set

solutions/Mission524Solutions.ipynb at master · dataquestio/solutions · GitHub

2. Analyzing Traffic Volume


3. Traffic Volume: Day vs. Night
5. Time Indicators
8. Weather Indicators
9. Weather Types
Guided Project: Finding Heavy Traffic Indicators on I-94
# Horizontal bar plot of the COVID death toll per country (pyplot interface).
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')

plt.barh(top20_deathtoll['Country_Other'],
         top20_deathtoll['Total_Deaths'])
plt.show()
3. Matplotlib Interfaces
Data Visualization With Matplotlib Course | Dataquest
4. The OO Interface
# Same bar chart, built with the object-oriented (Figure/Axes) interface.
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')

fig, ax = plt.subplots()
ax.barh(top20_deathtoll['Country_Other'],
        top20_deathtoll['Total_Deaths'])
plt.show()

5. Mobile-Friendly Proportions
# Portrait (mobile-friendly) figure proportions: 4.5 x 6 inches.
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')

fig, ax = plt.subplots(figsize=(4.5, 6))
ax.barh(top20_deathtoll['Country_Other'],
        top20_deathtoll['Total_Deaths'])
plt.show()

6. Maximizing Data-Ink
7. Erasing Non-Data Ink
# Remove non-data ink: all four spines and the tick marks.
# Restores for-loop indentation lost in the original paste (SyntaxError).
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')

fig, ax = plt.subplots(figsize=(4.5, 6))
ax.barh(top20_deathtoll['Country_Other'],
        top20_deathtoll['Total_Deaths'])

for location in ['left', 'right', 'top', 'bottom']:
    ax.spines[location].set_visible(False)
ax.tick_params(bottom=False, left=False)

plt.show()

8. Erasing Redundant Data-Ink


# Reduce redundant data ink: thinner bars and only three x ticks.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')

fig, ax = plt.subplots(figsize=(4.5, 6))
ax.barh(top20_deathtoll['Country_Other'],
        top20_deathtoll['Total_Deaths'],
        height=0.45)

for location in ['left', 'right', 'top', 'bottom']:
    ax.spines[location].set_visible(False)
ax.tick_params(bottom=False, left=False)

# Three ticks are enough to read the magnitudes.
ax.set_xticks([0, 150000, 300000])

plt.show()

9. The Direction of Reading


# Move the x axis to the top and mute it, so readers hit the data first.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')

fig, ax = plt.subplots(figsize=(4.5, 6))
ax.barh(top20_deathtoll['Country_Other'],
        top20_deathtoll['Total_Deaths'],
        height=0.45, color='#af0b1e')

for location in ['left', 'right', 'top', 'bottom']:
    ax.spines[location].set_visible(False)

ax.set_xticks([0, 150000, 300000])
ax.xaxis.tick_top()                       # labels above the bars
ax.tick_params(top=False, left=False)
ax.tick_params(axis='x', colors='grey')   # de-emphasize the axis

plt.show()

10. Title and Subtitle


# Add a bold title and a plain subtitle above the chart with ax.text.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')

fig, ax = plt.subplots(figsize=(4.5, 6))
ax.barh(top20_deathtoll['Country_Other'],
        top20_deathtoll['Total_Deaths'],
        height=0.45, color='#af0b1e')

for location in ['left', 'right', 'top', 'bottom']:
    ax.spines[location].set_visible(False)

ax.set_xticks([0, 150000, 300000])
ax.xaxis.tick_top()
ax.tick_params(top=False, left=False)
ax.tick_params(axis='x', colors='grey')

# Title and subtitle, placed in data coordinates above the top bar.
ax.text(x=-80000, y=23.5,
        s='The Death Toll Worldwide Is 1.5M+',
        weight='bold', size=17)
ax.text(x=-80000, y=22.5,
        s='Top 20 countries by death toll (December 2020)',
        size=12)

plt.show()
# Final polish: formatted x labels, country names drawn as text at the left,
# and a faint vertical reference line at 150,000.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

top20_deathtoll = pd.read_csv('top20_deathtoll.csv')

fig, ax = plt.subplots(figsize=(4.5, 6))
ax.barh(top20_deathtoll['Country_Other'],
        top20_deathtoll['Total_Deaths'],
        height=0.45, color='#af0b1e')

for location in ['left', 'right', 'top', 'bottom']:
    ax.spines[location].set_visible(False)

ax.set_xticks([0, 150000, 300000])
ax.xaxis.tick_top()
ax.tick_params(top=False, left=False)
ax.tick_params(axis='x', colors='grey')

ax.text(x=-80000, y=23.5,
        s='The Death Toll Worldwide Is 1.5M+',
        weight='bold', size=17)
ax.text(x=-80000, y=22.5,
        s='Top 20 countries by death toll (December 2020)',
        size=12)

ax.set_xticklabels(['0', '150,000', '300,000'])
ax.set_yticklabels([]) # an empty list removes the labels

# Re-draw country names as left-aligned text, one per bar.
country_names = top20_deathtoll['Country_Other']
for i, country in zip(range(20), country_names):
    ax.text(x=-80000, y=i-0.15, s=country)

ax.axvline(x=150000, ymin=0.045, c='grey',
           alpha=0.5)

plt.show()
The Visual Display of Quantitative Information by Edward R. Tufte (goodreads.com)

The Lifecycle of a Plot — Matplotlib 3.5.1 documentation (matplotlib.org)

Examples — Matplotlib 3.5.1 documentation (matplotlib.org)


Design for an Audience

1. Data Stories
2. Grid Charts in Matplotlib
# Four stacked line charts of the same monthly death series.
import pandas as pd
import matplotlib.pyplot as plt

death_toll = pd.read_csv('covid_avg_deaths.csv')

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
                                         figsize=(6,8))

ax1.plot(death_toll['Month'], death_toll['New_deaths'])
ax2.plot(death_toll['Month'], death_toll['New_deaths'])
ax3.plot(death_toll['Month'], death_toll['New_deaths'])
ax4.plot(death_toll['Month'], death_toll['New_deaths'])

plt.show()

3. Faster Workflow
# Style all four axes with one loop: plot, strip labels, ticks and spines.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

death_toll = pd.read_csv('covid_avg_deaths.csv')

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
                                         figsize=(6,8))
axes = [ax1, ax2, ax3, ax4]

for ax in axes:
    ax.plot(death_toll['Month'], death_toll['New_deaths'])
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(bottom=0, left=0)
    for location in ['left', 'right', 'top', 'bottom']:
        ax.spines[location].set_visible(False)

plt.show()

4. Modifying the Line Plots


# Faded full-year line on every panel, with one solid highlighted period each.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

death_toll = pd.read_csv('covid_avg_deaths.csv')

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
                                         figsize=(6,8))
axes = [ax1, ax2, ax3, ax4]

for ax in axes:
    ax.plot(death_toll['Month'], death_toll['New_deaths'],
            color='#af0b1e', alpha=0.1)
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(bottom=0, left=0)
    for location in ['left', 'right', 'top', 'bottom']:
        ax.spines[location].set_visible(False)

# One highlighted period per panel (slices overlap by one month on purpose).
ax1.plot(death_toll['Month'][:3], death_toll['New_deaths'][:3],
         color='#af0b1e', linewidth=2.5)
ax2.plot(death_toll['Month'][2:6], death_toll['New_deaths'][2:6],
         color='#af0b1e', linewidth=2.5)
ax3.plot(death_toll['Month'][5:10], death_toll['New_deaths'][5:10],
         color='#af0b1e', linewidth=2.5)
ax4.plot(death_toll['Month'][9:12], death_toll['New_deaths'][9:12],
         color='#af0b1e', linewidth=2.5)

plt.show()

5. Adding Structural Elements


# Add structural annotations: min/max value labels and a period label per panel.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

death_toll = pd.read_csv('covid_avg_deaths.csv')

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
                                         figsize=(6,8))
axes = [ax1, ax2, ax3, ax4]

for ax in axes:
    ax.plot(death_toll['Month'], death_toll['New_deaths'],
            color='#af0b1e', alpha=0.1)
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(bottom=0, left=0)
    for location in ['left', 'right', 'top', 'bottom']:
        ax.spines[location].set_visible(False)

ax1.plot(death_toll['Month'][:3], death_toll['New_deaths'][:3],
         color='#af0b1e', linewidth=2.5)
ax2.plot(death_toll['Month'][2:6], death_toll['New_deaths'][2:6],
         color='#af0b1e', linewidth=2.5)
ax3.plot(death_toll['Month'][5:10], death_toll['New_deaths'][5:10],
         color='#af0b1e', linewidth=2.5)
ax4.plot(death_toll['Month'][9:12], death_toll['New_deaths'][9:12],
         color='#af0b1e', linewidth=2.5)

# Value anchors on the first panel, then one period label per panel.
ax1.text(0.5, -80, '0', alpha=0.5)
ax1.text(3.5, 2000, '1,844', alpha=0.5)
ax1.text(11.5, 2400, '2,247', alpha=0.5)
ax1.text(1.1, -300, 'Jan - Mar', color='#af0b1e',
         weight='bold', rotation=3)
ax2.text(3.7, 800, 'Mar - Jun', color='#af0b1e', weight='bold')
ax3.text(7.1, 500, 'Jun - Oct', color='#af0b1e', weight='bold')
ax4.text(10.5, 600, 'Oct - Dec', color='#af0b1e', weight='bold',
         rotation=45)

plt.show()
6. Title and Subtitle
WHO Coronavirus (COVID-19) Dashboard | WHO Coronavirus (COVID-19) Dashboard With
Vaccination Data

# Full grid chart with title and subtitle on the top panel.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

death_toll = pd.read_csv('covid_avg_deaths.csv')

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
                                         figsize=(6,8))
axes = [ax1, ax2, ax3, ax4]

for ax in axes:
    ax.plot(death_toll['Month'], death_toll['New_deaths'],
            color='#af0b1e', alpha=0.1)
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(bottom=0, left=0)
    for location in ['left', 'right', 'top', 'bottom']:
        ax.spines[location].set_visible(False)

ax1.plot(death_toll['Month'][:3], death_toll['New_deaths'][:3],
         color='#af0b1e', linewidth=2.5)
ax1.text(0.5, -80, '0', alpha=0.5)
ax1.text(3.5, 2000, '1,844', alpha=0.5)
ax1.text(11.5, 2400, '2,247', alpha=0.5)
ax1.text(1.1, -300, 'Jan - Mar', color='#af0b1e',
         weight='bold', rotation=3)

ax2.plot(death_toll['Month'][2:6], death_toll['New_deaths'][2:6],
         color='#af0b1e', linewidth=2.5)
ax2.text(3.7, 800, 'Mar - Jun', color='#af0b1e', weight='bold')

ax3.plot(death_toll['Month'][5:10], death_toll['New_deaths'][5:10],
         color='#af0b1e', linewidth=2.5)
ax3.text(7.1, 500, 'Jun - Oct', color='#af0b1e', weight='bold')

ax4.plot(death_toll['Month'][9:12], death_toll['New_deaths'][9:12],
         color='#af0b1e', linewidth=2.5)
ax4.text(10.5, 600, 'Oct - Dec', color='#af0b1e', weight='bold',
         rotation=45)

# Title and subtitle on the top panel.
ax1.text(0.5, 3500, 'The virus kills 851 people each day',
         size=14, weight='bold')
ax1.text(0.5, 3150, 'Average number of daily deaths per month in the US',
         size=12)

plt.show()

7. Adding a Progress Bar


# Add a faint horizontal "progress bar" track (axes fraction 0.5-0.8) on
# every panel. Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

death_toll = pd.read_csv('covid_avg_deaths.csv')

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
                                         figsize=(6,8))
axes = [ax1, ax2, ax3, ax4]

for ax in axes:
    ax.plot(death_toll['Month'], death_toll['New_deaths'],
            color='#af0b1e', alpha=0.1)
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(bottom=0, left=0)
    for location in ['left', 'right', 'top', 'bottom']:
        ax.spines[location].set_visible(False)

ax1.plot(death_toll['Month'][:3], death_toll['New_deaths'][:3],
         color='#af0b1e', linewidth=2.5)
ax1.text(0.5, -80, '0', alpha=0.5)
ax1.text(3.5, 2000, '1,844', alpha=0.5)
ax1.text(11.5, 2400, '2,247', alpha=0.5)
ax1.text(1.1, -300, 'Jan - Mar', color='#af0b1e',
         weight='bold', rotation=3)
ax1.text(0.5, 3500, 'The virus kills 851 people each day',
         size=14, weight='bold')
ax1.text(0.5, 3150, 'Average number of daily deaths per month in the US',
         size=12)

ax2.plot(death_toll['Month'][2:6], death_toll['New_deaths'][2:6],
         color='#af0b1e', linewidth=2.5)
ax2.text(3.7, 800, 'Mar - Jun', color='#af0b1e', weight='bold')

ax3.plot(death_toll['Month'][5:10], death_toll['New_deaths'][5:10],
         color='#af0b1e', linewidth=2.5)
ax3.text(7.1, 500, 'Jun - Oct', color='#af0b1e', weight='bold')

ax4.plot(death_toll['Month'][9:12], death_toll['New_deaths'][9:12],
         color='#af0b1e', linewidth=2.5)
ax4.text(10.5, 600, 'Oct - Dec', color='#af0b1e', weight='bold',
         rotation=45)

# Progress-bar track; alternatively, integrate this into the styling loop above.
for ax in axes:
    ax.axhline(y=1600, xmin=0.5, xmax=0.8,
               linewidth=6, color='#af0b1e',
               alpha=0.1)

plt.show()

8. Completing the Progress Bar


# Complete the progress bars: fill each track proportionally to the
# cumulative death toll, and label it with the formatted number.
# Restores for-loop indentation lost in the original paste.
import pandas as pd
import matplotlib.pyplot as plt

death_toll = pd.read_csv('covid_avg_deaths.csv')

deaths = [2398, 126203, 227178, 295406]
# Fraction of the final toll reached by the end of each period.
proportions = [round(death/295406, 2) for death in deaths]
# Map each fraction onto the track's 0.5-0.8 axes-fraction span.
xmax_vals = [round(0.5 + proportion * 0.3, 3) for proportion in proportions]

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1,
                                         figsize=(6,8))
axes = [ax1, ax2, ax3, ax4]

for ax in axes:
    ax.plot(death_toll['Month'], death_toll['New_deaths'],
            color='#af0b1e', alpha=0.1)
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(bottom=0, left=0)
    for location in ['left', 'right', 'top', 'bottom']:
        ax.spines[location].set_visible(False)

ax1.plot(death_toll['Month'][:3], death_toll['New_deaths'][:3],
         color='#af0b1e', linewidth=2.5)
ax1.text(0.5, -80, '0', alpha=0.5)
ax1.text(3.5, 2000, '1,844', alpha=0.5)
ax1.text(11.5, 2400, '2,247', alpha=0.5)
ax1.text(1.1, -300, 'Jan - Mar', color='#af0b1e',
         weight='bold', rotation=3)
ax1.text(0.5, 3500, 'The virus kills 851 people each day',
         size=14, weight='bold')
ax1.text(0.5, 3150, 'Average number of daily deaths per month in the US',
         size=12)

ax2.plot(death_toll['Month'][2:6], death_toll['New_deaths'][2:6],
         color='#af0b1e', linewidth=2.5)
ax2.text(3.7, 800, 'Mar - Jun', color='#af0b1e', weight='bold')

ax3.plot(death_toll['Month'][5:10], death_toll['New_deaths'][5:10],
         color='#af0b1e', linewidth=2.5)
ax3.text(7.1, 500, 'Jun - Oct', color='#af0b1e', weight='bold')

ax4.plot(death_toll['Month'][9:12], death_toll['New_deaths'][9:12],
         color='#af0b1e', linewidth=2.5)
ax4.text(10.5, 600, 'Oct - Dec', color='#af0b1e', weight='bold',
         rotation=45)

# Faint track plus a solid fill up to each panel's proportion, labeled
# with the thousands-separated cumulative toll.
for ax, xmax, death in zip(axes, xmax_vals, deaths):
    ax.axhline(y=1600, xmin=0.5, xmax=0.8,
               linewidth=6, color='#af0b1e',
               alpha=0.1)
    ax.axhline(y=1600, xmin=0.5, xmax=xmax,
               linewidth=6, color='#af0b1e')
    ax.text(7.5, 1850, format(death, ','),
            color='#af0b1e', weight='bold')

plt.show()
The Visual Display of Quantitative Information by Edward R. Tufte (goodreads.com)

Examples — Matplotlib 3.5.1 documentation

Is Your Data Story Actually A Story? | by Joshua Smith | Nightingale | Medium

Gestalt Principles and Pre-Attentive Attributes

1. Gestalt Principles

Link:

Gestalt Principles And Pre-attentive Attributes — Gestalt Principles | Dataquest


2. Proximity
# Quiz answers: the proximity principle.
sentence_1 = True
sentence_2 = True
sentence_3 = False

3. Similarity
# Quiz answers: the similarity principle.
sentence_1 = False  # it's similarity of color, not shape
sentence_2 = True
sentence_3 = True

4. Enclosure
# Quiz answers: the enclosure principle.
sentence_1 = True
sentence_2 = True
sentence_3 = True

5. Connection
6. Visual Hierarchy
# Quiz answers: visual hierarchy.
sentence_1 = False  # enclosure is stronger
sentence_2 = True
sentence_3 = False  # enclosure is stronger
sentence_4 = False  # human perception is non-random

7. Pre-Attentive Attributes
# Quiz answers: pre-attentive attributes.
sentence_1 = True
sentence_2 = False
sentence_3 = True
Pre-attentive processing - Wikipedia

Gestalt psychology - Wikipedia

Matplotlib Styles: FiveThirtyEight Case Study


# Demonstrate switching Matplotlib styles ('ggplot' vs. 'default').
import matplotlib.style as style
import matplotlib.pyplot as plt  # fix: plt was used below but never imported

style.use('ggplot')
plt.plot([2, 4, 6], [10, 15, 5])
plt.show()

style.use('default')
plt.plot([2, 4, 6], [10, 15, 5])
plt.show()

2. Wine Quality Dataset


# Correlation of every wine property with quality (quality itself excluded).
import pandas as pd

red_wine = pd.read_csv('winequality-red.csv', sep=';')
red_corr = red_wine.corr()['quality'][:-1]

white_wine = pd.read_csv('winequality-white.csv', sep=';')
white_corr = white_wine.corr()['quality'][:-1]
print(white_corr)

3. FiveThirtyEight Style
# FiveThirtyEight-style back-to-back bar chart of wine-quality correlations.
# Uses white_corr/red_corr, style and plt from the preceding cells.
style.use('fivethirtyeight')

fig, ax = plt.subplots(figsize=(9, 5))
ax.barh(white_corr.index, white_corr, left=2, height=0.5)
ax.barh(red_corr.index, red_corr, height=0.5)

# fix: grid's 'b' parameter was deprecated in Matplotlib 3.5 and removed
# in 3.6 — the keyword is now 'visible'.
ax.grid(visible=False)
ax.set_yticklabels([])
ax.set_xticklabels([])

plt.show()

4. Adding Y-tick Labels


# Hand-placed y labels between the two bar groups, plus faint zero lines.
# Restores for-loop indentation lost in the original paste; also renames
# grid's removed 'b' keyword to 'visible' (Matplotlib >= 3.6).
style.use('fivethirtyeight')

fig, ax = plt.subplots(figsize=(9, 5))
ax.barh(white_corr.index, white_corr, left=2, height=0.5)
ax.barh(red_corr.index, red_corr, height=0.5, left=-0.1)

ax.grid(visible=False)
ax.set_yticklabels([])
ax.set_xticklabels([])

# One label per property, placed top to bottom between the groups.
x_coords = {'Alcohol': 0.82, 'Sulphates': 0.77, 'pH': 0.91,
            'Density': 0.80, 'Total Sulfur Dioxide': 0.59,
            'Free Sulfur Dioxide': 0.6, 'Chlorides': 0.77,
            'Residual Sugar': 0.67, 'Citric Acid': 0.76,
            'Volatile Acidity': 0.67, 'Fixed Acidity': 0.71}
y_coord = 9.8
for y_label, x_coord in x_coords.items():
    ax.text(x_coord, y_coord, y_label)
    y_coord -= 1

# Faint vertical reference (zero) lines, one per bar group.
ax.axvline(0.5, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)
ax.axvline(1.45, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)

plt.show()
5. Adding X-tick Labels
# Add custom x-axis baselines/labels and group headers (RED/WHITE WINE).
# Restores for-loop indentation lost in the original paste; also renames
# grid's removed 'b' keyword to 'visible' (Matplotlib >= 3.6).
style.use('fivethirtyeight')

fig, ax = plt.subplots(figsize=(9, 5))
ax.barh(white_corr.index, white_corr, left=2, height=0.5)
ax.barh(red_corr.index, red_corr, height=0.5, left=-0.1)

ax.grid(visible=False)
ax.set_yticklabels([])
ax.set_xticklabels([])

x_coords = {'Alcohol': 0.82, 'Sulphates': 0.77, 'pH': 0.91,
            'Density': 0.80, 'Total Sulfur Dioxide': 0.59,
            'Free Sulfur Dioxide': 0.6, 'Chlorides': 0.77,
            'Residual Sugar': 0.67, 'Citric Acid': 0.76,
            'Volatile Acidity': 0.67, 'Fixed Acidity': 0.71}
y_coord = 9.8
for y_label, x_coord in x_coords.items():
    ax.text(x_coord, y_coord, y_label)
    y_coord -= 1

ax.axvline(0.5, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)
ax.axvline(1.45, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)

# Bottom baselines with '-0.5 ... +0.5' range labels per group.
ax.axhline(-1, color='grey', linewidth=1, alpha=0.5,
           xmin=0.01, xmax=0.32)
ax.text(-0.7, -1.7, '-0.5'+ ' '*31 + '+0.5',
        color='grey', alpha=0.5)
ax.axhline(-1, color='grey', linewidth=1, alpha=0.5,
           xmin=0.67, xmax=0.98)
ax.text(1.43, -1.7, '-0.5'+ ' '*31 + '+0.5',
        color='grey', alpha=0.5)

# Group headers above each bar group.
ax.axhline(11, color='grey', linewidth=1, alpha=0.5,
           xmin=0.01, xmax=0.32)
ax.text(-0.33, 11.2, 'RED WINE', weight='bold')
ax.axhline(11, color='grey', linewidth=1, alpha=0.5,
           xmin=0.67, xmax=0.98)
ax.text(1.75, 11.2, 'WHITE WINE', weight='bold')

plt.show()

6. Adding a Signature

# Add the FiveThirtyEight-style signature bar, title and subtitle.
# Restores for-loop indentation lost in the original paste; also renames
# grid's removed 'b' keyword to 'visible' (Matplotlib >= 3.6).
style.use('fivethirtyeight')

fig, ax = plt.subplots(figsize=(9, 5))
ax.barh(white_corr.index, white_corr, left=2, height=0.5)
ax.barh(red_corr.index, red_corr, height=0.5, left=-0.1)

ax.grid(visible=False)
ax.set_yticklabels([])
ax.set_xticklabels([])

x_coords = {'Alcohol': 0.82, 'Sulphates': 0.77, 'pH': 0.91,
            'Density': 0.80, 'Total Sulfur Dioxide': 0.59,
            'Free Sulfur Dioxide': 0.6, 'Chlorides': 0.77,
            'Residual Sugar': 0.67, 'Citric Acid': 0.76,
            'Volatile Acidity': 0.67, 'Fixed Acidity': 0.71}
y_coord = 9.8
for y_label, x_coord in x_coords.items():
    ax.text(x_coord, y_coord, y_label)
    y_coord -= 1

ax.axvline(0.5, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)
ax.axvline(1.45, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)

ax.axhline(-1, color='grey', linewidth=1, alpha=0.5,
           xmin=0.01, xmax=0.32)
ax.text(-0.7, -1.7, '-0.5'+ ' '*31 + '+0.5',
        color='grey', alpha=0.5)
ax.axhline(-1, color='grey', linewidth=1, alpha=0.5,
           xmin=0.67, xmax=0.98)
ax.text(1.43, -1.7, '-0.5'+ ' '*31 + '+0.5',
        color='grey', alpha=0.5)

ax.axhline(11, color='grey', linewidth=1, alpha=0.5,
           xmin=0.01, xmax=0.32)
ax.text(-0.33, 11.2, 'RED WINE', weight='bold')
ax.axhline(11, color='grey', linewidth=1, alpha=0.5,
           xmin=0.67, xmax=0.98)
ax.text(1.75, 11.2, 'WHITE WINE', weight='bold')

# Signature bar: light text on a dark background strip.
ax.text(-0.7, -2.9,
        '©DATAQUEST' + ' '*94 + 'Source: P. Cortez et al.',
        color = '#f0f0f0', backgroundcolor = '#4d4d4d',
        size=12)

# Title and subtitle.
ax.text(-0.7, 13.5,
        'Wine Quality Most Strongly Correlated With Alcohol Level',
        size=17, weight='bold')
ax.text(-0.7, 12.7,
        'Correlation values between wine quality and wine properties (alcohol, pH, etc.)')

plt.show()

7. Coloring Bars Differently


# Color bars by correlation sign: blue for positive, orange for negative.
# Restores for-loop indentation lost in the original paste; also renames
# grid's removed 'b' keyword to 'visible' (Matplotlib >= 3.6).
positive_white = white_corr >= 0
color_map_white = positive_white.map({True:'#33A1C9', False:'#ffae42'})

style.use('fivethirtyeight')

fig, ax = plt.subplots(figsize=(9, 5))
ax.barh(white_corr.index, white_corr, left=2, height=0.5,
        color=color_map_white)

ax.grid(visible=False)
ax.set_yticklabels([])
ax.set_xticklabels([])

x_coords = {'Alcohol': 0.82, 'Sulphates': 0.77, 'pH': 0.91,
            'Density': 0.80, 'Total Sulfur Dioxide': 0.59,
            'Free Sulfur Dioxide': 0.6, 'Chlorides': 0.77,
            'Residual Sugar': 0.67, 'Citric Acid': 0.76,
            'Volatile Acidity': 0.67, 'Fixed Acidity': 0.71}
y_coord = 9.8
for y_label, x_coord in x_coords.items():
    ax.text(x_coord, y_coord, y_label)
    y_coord -= 1

ax.axvline(0.5, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)
ax.axvline(1.45, c='grey', alpha=0.1, linewidth=1,
           ymin=0.1, ymax=0.9)

ax.axhline(-1, color='grey', linewidth=1, alpha=0.5,
           xmin=0.01, xmax=0.32)
ax.text(-0.7, -1.7, '-0.5'+ ' '*31 + '+0.5',
        color='grey', alpha=0.5)
ax.axhline(-1, color='grey', linewidth=1, alpha=0.5,
           xmin=0.67, xmax=0.98)
ax.text(1.43, -1.7, '-0.5'+ ' '*31 + '+0.5',
        color='grey', alpha=0.5)

ax.axhline(11, color='grey', linewidth=1, alpha=0.5,
           xmin=0.01, xmax=0.32)
ax.text(-0.33, 11.2, 'RED WINE', weight='bold')
ax.axhline(11, color='grey', linewidth=1, alpha=0.5,
           xmin=0.67, xmax=0.98)
ax.text(1.75, 11.2, 'WHITE WINE', weight='bold')

ax.text(-0.7, -2.9, '©DATAQUEST' + ' '*92 + 'Source: P. Cortez et al.',
        color = '#f0f0f0', backgroundcolor = '#4d4d4d',
        size=12)

ax.text(-0.7, 13.5,
        'Wine Quality Most Strongly Correlated With Alcohol Level',
        size=17, weight='bold')
ax.text(-0.7, 12.7,
        'Correlation values between wine quality and wine properties (alcohol, pH, etc.)')

# Red-wine bars, drawn last with the same sign-based color mapping.
positive_red = red_corr >= 0
color_map_red = positive_red.map({True:'#33A1C9', False:'#ffae42'})
ax.barh(red_corr.index, red_corr, height=0.5, left=-0.1,
        color=color_map_red)

plt.show()
How to Generate FiveThirtyEight Graphs in Python – Dataquest

Guided Project: Storytelling Data Visualization on Exchange Rates


An In-Depth Style Guide for Data Science Projects – Dataquest

Latest 529 topics - Dataquest Community


Data Aggregation

World Happiness Report | Kaggle


# Aggregate the 2015 World Happiness data by region: first manually with a
# loop, then with the equivalent groupby / agg / pivot_table operations.
happiness2015 = pd.read_csv("World_Happiness_2015.csv")
first_5 = happiness2015.head()
happiness2015.info()

# Manual group-and-aggregate: mean happiness score per region.
mean_happiness = {}
regions = happiness2015['Region'].unique()
for r in regions:
    region_group = happiness2015[happiness2015['Region'] == r]
    region_mean = region_group['Happiness Score'].mean()
    mean_happiness[r] = region_mean

# Same grouping via the pandas GroupBy object.
grouped = happiness2015.groupby('Region')
aus_nz = grouped.get_group('Australia and New Zealand')

# get_group returns the same rows as boolean/positional selection.
grouped = happiness2015.groupby('Region')
north_america = happiness2015.iloc[[4, 14]]
na_group = grouped.get_group('North America')
equal = north_america == na_group

# Aggregate every column at once.
# NOTE(review): in pandas >= 2.0 this raises on non-numeric columns unless
# numeric_only=True is passed — confirm the pandas version in use.
grouped = happiness2015.groupby('Region')
means = grouped.mean()

# Aggregate a single column.
grouped = happiness2015.groupby('Region')
happy_grouped = grouped['Happiness Score']
happy_mean = happy_grouped.mean()

import numpy as np

grouped = happiness2015.groupby('Region')
happy_grouped = grouped['Happiness Score']

def dif(group):
    """Return the spread between a group's maximum and its mean."""
    return (group.max() - group.mean())

# agg accepts a list of functions or a custom callable.
happy_mean_max = happy_grouped.agg([np.mean, np.max])
mean_max_dif = happy_grouped.agg(dif)

# Shorthand: group, select, and aggregate in one chained expression.
happiness_means = happiness2015.groupby('Region')['Happiness Score'].mean()
print(happiness_means)

# pivot_table equivalent of the groupby above; margins=True adds an 'All' row
# holding the overall mean.
pv_happiness = happiness2015.pivot_table(values='Happiness Score', index='Region',
                                         aggfunc=np.mean, margins=True)
pv_happiness.plot(kind='barh', xlim=(0,10), title='Mean Happiness Scores by Region', legend=False)
world_mean_happiness = happiness2015['Happiness Score'].mean()

# Multiple columns and multiple aggregation functions at once.
grouped = happiness2015.groupby('Region')[['Happiness Score','Family']]
happy_family_stats = grouped.agg([np.min, np.max, np.mean])
pv_happy_family_stats = happiness2015.pivot_table(['Happiness Score', 'Family'], 'Region',
                                                  aggfunc=[np.min, np.max, np.mean],
                                                  margins=True)
Combining Data Using Pandas
import pandas as pd

# Load each year's World Happiness report and stamp every row with its year,
# so the three frames stay distinguishable after they are combined later.
happiness2015 = pd.read_csv("World_Happiness_2015.csv")
happiness2016 = pd.read_csv("World_Happiness_2016.csv")
happiness2017 = pd.read_csv("World_Happiness_2017.csv")

for report, report_year in ((happiness2015, 2015),
                            (happiness2016, 2016),
                            (happiness2017, 2017)):
    report['Year'] = report_year

Combining Data Using Pandas


Merge, join, concatenate and compare — pandas 1.4.1 documentation (pydata.org)

Transforming Data with Pandas

# Element-wise transformations on the happiness data: rename columns, then
# apply classification functions via map / apply / applymap, and finally
# reshape with melt for per-factor percentage analysis.
# NOTE(review): `happiness2015` is loaded earlier in the file.

# Shorten the verbose factor column names used throughout this section.
mapping = {'Economy (GDP per Capita)': 'Economy',
           'Health (Life Expectancy)': 'Health',
           'Trust (Government Corruption)': 'Trust'}
happiness2015 = happiness2015.rename(mapping, axis = 1)

def label(element):
    """Classify a factor contribution as 'High' (> 1) or 'Low'."""
    if element > 1:
        return 'High'
    else:
        return 'Low'

# Series.map and Series.apply are equivalent for element-wise transforms.
economy_impact_map = happiness2015['Economy'].map(label)
economy_impact_apply = happiness2015['Economy'].apply(label)
equal = economy_impact_map.equals(economy_impact_apply)

def label(element):
    """Classify a factor contribution as 'High' (> 1) or 'Low'."""
    if element > 1:
        return 'High'
    else:
        return 'Low'

economy_impact_apply = happiness2015['Economy'].apply(label)

def label(element, x):
    """Classify a factor contribution against a caller-supplied threshold x."""
    if element > x:
        return 'High'
    else:
        return 'Low'

# Unlike map, apply can forward extra keyword arguments to the function.
economy_impact_apply = happiness2015['Economy'].apply(label, x = .8)

def label(element):
    """Classify a factor contribution as 'High' (> 1) or 'Low'."""
    if element > 1:
        return 'High'
    else:
        return 'Low'

economy_apply = happiness2015['Economy'].apply(label)

# applymap runs the function on every element of a DataFrame.
# NOTE(review): applymap was renamed to DataFrame.map in pandas 2.1.
factors = ['Economy', 'Family', 'Health', 'Freedom', 'Trust', 'Generosity']
factors_impact = happiness2015[factors].applymap(label)

def v_counts(col):
    """Return the High/Low value counts of a column as proportions."""
    num = col.value_counts()
    den = col.size
    return num/den

# DataFrame.apply runs the function once per column (not per element).
v_counts_pct = factors_impact.apply(v_counts)

factors = ['Economy', 'Family', 'Health', 'Freedom', 'Trust', 'Generosity', 'Dystopia Residual']

def percentages(col):
    """Return each factor value as a percentage of the happiness score."""
    div = col/happiness2015['Happiness Score']
    return div * 100

factor_percentages = happiness2015[factors].apply(percentages)

# Reshape from wide to long: one row per (country, factor) pair.
main_cols = ['Country', 'Region', 'Happiness Rank', 'Happiness Score']
factors = ['Economy', 'Family', 'Health', 'Freedom', 'Trust', 'Generosity', 'Dystopia Residual']
melt = pd.melt(happiness2015, id_vars = main_cols, value_vars = factors)
melt['Percentage'] = round(melt['value']/melt['Happiness Score'] * 100, 2)

# Same melt with the column lists written inline.
melt = pd.melt(happiness2015, id_vars = ['Country', 'Region', 'Happiness Rank', 'Happiness Score'],
               value_vars= ['Economy', 'Family', 'Health', 'Freedom', 'Trust', 'Generosity', 'Dystopia Residual'])
melt['Percentage'] = round(melt['value']/melt['Happiness Score'] * 100, 2)

# Average contribution of each factor, visualized as a pie chart.
pv_melt = melt.pivot_table(index='variable', values='value')
pv_melt.plot(kind='pie', y='value', legend=False)


Tidy Data | Journal of Statistical Software (jstatsoft.org)

Working with Strings In Pandas


# Vectorized string operations: merge happiness data with World Bank
# development indicators, then clean and extract text with the .str accessor.
# NOTE(review): `happiness2015` and `plt` are defined earlier in the file.
world_dev = pd.read_csv("World_dev.csv")
col_renaming = {'SourceOfMostRecentIncomeAndExpenditureData': 'IESurvey'}
merged = pd.merge(left=happiness2015, right=world_dev, how='left', left_on='Country',
                  right_on='ShortName')
merged = merged.rename(col_renaming, axis=1)

def extract_last_word(element):
    """Return the last whitespace-separated word of element (as a string)."""
    return str(element).split()[-1]

# apply with a plain Python function vs. the vectorized .str equivalent.
merged['Currency Apply'] = merged['CurrencyUnit'].apply(extract_last_word)
merged['Currency Vectorized'] = merged['CurrencyUnit'].str.split().str.get(-1)
print(merged['Currency Vectorized'].head())

lengths = merged['CurrencyUnit'].str.len()
value_counts = lengths.value_counts(dropna=False)

# Rows whose notes mention "national accounts" (either capitalization).
pattern = r"[Nn]ational accounts"
national_accounts = merged['SpecialNotes'].str.contains(pattern)
print(national_accounts.head())

# na=False treats missing notes as non-matches, so the mask can index merged.
pattern = r"[Nn]ational accounts"
national_accounts = merged['SpecialNotes'].str.contains(r"[Nn]ational accounts", na=False)
merged_national_accounts = merged[national_accounts]
print(merged_national_accounts.head())

# Extract a four-digit year (1000-2999) from the notes.
pattern = r"([1-2][0-9]{3})"
years = merged['SpecialNotes'].str.extract(pattern)

# Named group + extractall: every year mentioned, not just the first.
pattern = r"(?P<Years>[1-2][0-9]{3})"
years = merged['IESurvey'].str.extractall(pattern)
value_counts = years['Years'].value_counts()
print(value_counts)

# Survey ranges like "2005/06": expand the two-digit second year to four
# digits by borrowing the century from the first year.
pattern = r"(?P<First_Year>[1-2][0-9]{3})/?(?P<Second_Year>[0-9]{2})?"
years = merged['IESurvey'].str.extractall(pattern)
first_two_year = years['First_Year'].str[0:2]
years['Second_Year'] = first_two_year + years['Second_Year']

# Normalize income-group labels ("Upper middle income:..." -> "UPPER MIDDLE")
# and plot mean happiness per income group.
merged['IncomeGroup'] = merged['IncomeGroup'].str.replace(' income', '').str.replace(':', '').str.upper()
pv_incomes = merged.pivot_table(values='Happiness Score', index='IncomeGroup')
pv_incomes.plot(kind='bar', rot=30, ylim=(0,10))
plt.show()
Working with text data — pandas 1.4.1 documentation (pydata.org)

6.2. re — Regular expression operations — Python 3.4.10 documentation

Working With Missing And Duplicate Data

# Handle missing and duplicate values: normalize column names across the
# three yearly reports, combine them, then deduplicate and fill gaps.
# NOTE(review): `happiness2015/2016/2017` and `regions` are defined earlier.
shape_2015 = happiness2015.shape
shape_2016 = happiness2016.shape
shape_2017 = happiness2017.shape

missing_2016 = happiness2016.isnull().sum()
missing_2017 = happiness2017.isnull().sum()

# Normalize 2017 names: dots to spaces, collapse runs of whitespace, uppercase.
# regex= is passed explicitly: '.', '(' and ')' are regex metacharacters, and
# the original calls relied on the pre-2.0 pandas default (regex=True), under
# which '.' matches *every* character and '(' is a regex error.
happiness2017.columns = (happiness2017.columns
                         .str.replace('.', ' ', regex=False)
                         .str.replace(r'\s+', ' ', regex=True)
                         .str.strip().str.upper())
# Strip parentheses from the 2015/2016 names, then uppercase.
happiness2015.columns = (happiness2015.columns
                         .str.replace('(', '', regex=False)
                         .str.replace(')', '', regex=False)
                         .str.strip().str.upper())
happiness2016.columns = (happiness2016.columns
                         .str.replace('(', '', regex=False)
                         .str.replace(')', '', regex=False)
                         .str.strip().str.upper())

combined = pd.concat([happiness2015, happiness2016, happiness2017], ignore_index=True)
missing = combined.isnull().sum()

# The 2017 report has no region column, so those rows are all missing.
regions_2017 = combined[combined['YEAR']==2017]['REGION']
missing = regions_2017.isnull().sum()

# Fill regions from a complete country->region lookup, then drop the
# partially-missing original column left behind by the merge.
combined = pd.merge(left=combined, right=regions, on='COUNTRY', how='left')
combined = combined.drop('REGION_x', axis = 1)
missing = combined.isnull().sum()

# Uppercase country names so duplicates differing only in case match.
combined['COUNTRY'] = combined['COUNTRY'].str.upper()
dups = combined.duplicated(['COUNTRY', 'YEAR'])
print(combined[dups])

combined['COUNTRY'] = combined['COUNTRY'].str.upper()
combined = combined.drop_duplicates(['COUNTRY', 'YEAR'])

# Drop columns that exist in only some of the yearly reports.
columns_to_drop = ['LOWER CONFIDENCE INTERVAL', 'STANDARD ERROR',
                   'UPPER CONFIDENCE INTERVAL', 'WHISKER HIGH', 'WHISKER LOW']
combined = combined.drop(columns_to_drop, axis = 1)
missing = combined.isnull().sum()

# Keep only columns with at least 159 non-null values.
combined = combined.dropna(thresh=159, axis=1)
missing = combined.isnull().sum()

# Impute missing happiness scores with the overall mean (leaves the mean
# unchanged, as the second print confirms).
happiness_mean = combined['HAPPINESS SCORE'].mean()
print(happiness_mean)
combined['HAPPINESS SCORE UPDATED'] = combined['HAPPINESS SCORE'].fillna(happiness_mean)
print(combined['HAPPINESS SCORE UPDATED'].mean())

# Finally, drop any rows that still contain missing values.
combined = combined.dropna()
missing = combined.isnull().sum()
Working with missing data — pandas 1.4.1 documentation (pydata.org)
Regular Expression Basics

import re

# Count Hacker News story titles that mention Python in either capitalization.
# NOTE(review): `hn` (the hacker_news DataFrame) is loaded earlier in the file.
titles = hn["title"].tolist()

python_mentions = 0
pattern = "[Pp]ython"  # character class covers "Python" and "python"
for t in titles:
    if re.search(pattern, t):
        python_mentions += 1
re — Regular expression operations — Python 3.10.2 documentation

RegExr: Learn, Build, & Test RegEx

Advanced Regular Expressions

import pandas as pd

import re

# Advanced regular expressions on Hacker News titles and URLs: flags,
# capture groups, lookarounds, backreferences, and named groups.
hn = pd.read_csv("hacker_news.csv")
titles = hn['title']

# Case-insensitive count of titles containing "SQL".
sql_pattern = r"SQL"
sql_counts = titles.str.contains(sql_pattern, flags=re.I).sum()

# Titles mentioning a SQL flavor (MySQL, PostgreSQL, ...): capture the flavor
# and compare average comment counts per flavor.
hn_sql = hn[hn['title'].str.contains(r"\w+SQL", flags=re.I)].copy()
hn_sql["flavor"] = hn_sql["title"].str.extract(r"(\w+SQL)", re.I, expand=False)
hn_sql["flavor"] = hn_sql["flavor"].str.lower()
sql_pivot = hn_sql.pivot_table(index="flavor",values="num_comments", aggfunc='mean')

# Python version numbers mentioned in titles, e.g. "Python 3.6".
pattern = r"[Pp]ython ([\d\.]+)"
py_versions = titles.str.extract(pattern, expand=False)
py_versions_freq = dict(py_versions.value_counts())

def first_10_matches(pattern):
    """
    Return the first 10 story titles that match
    the provided regular expression
    """
    all_matches = titles[titles.str.contains(pattern)]
    first_10 = all_matches.head(10)
    return first_10

# Match the language "C" as a standalone word, excluding "C." and "C+".
# pattern = r"\b[Cc]\b"
pattern = r"\b[Cc]\b[^.+]"
first_ten = first_10_matches(pattern)

# Lookbehind/lookahead refinement: also exclude "Series C", but allow a
# sentence-final "C.".
pattern = r"(?<!Series\s)\b[Cc]\b((?![+.])|\.$)"
c_mentions = titles.str.contains(pattern).sum()

# Backreference \1: titles containing a repeated word ("the the").
pattern = r"\b(\w+)\s\1\b"
repeated_words = titles[titles.str.contains(pattern)]

# Normalize the many spellings of "email" to a single form.
email_variations = pd.Series(['email', 'Email', 'e Mail',
                              'e mail', 'E-mail', 'e-mail',
                              'eMail', 'E-Mail', 'EMAIL'])
pattern = r"\be[-\s]?mail"
email_uniform = email_variations.str.replace(pattern, "email", flags=re.I)
titles_clean = titles.str.replace(pattern, "email", flags=re.I)

# Sample URLs covering the edge cases the domain pattern must handle.
test_urls = pd.Series([
    'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429',
    'http://www.interactivedynamicvideo.com/',
    'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0',
    'http://evonomics.com/advertising-cannot-maintain-internet-heres-solution/',
    'HTTPS://github.com/keppel/pinn',
    'Http://phys.org/news/2015-09-scale-solar-youve.html',
    'https://iot.seeed.cc',
    'http://www.bfilipek.com/2016/04/custom-deleters-for-c-smart-pointers.html',
    'http://beta.crowdfireapp.com/?beta=agnipath',
    'https://www.valid.ly?param',
    'http://css-cursor.techstream.org'
])

# Capture just the domain, then rank the five most common domains.
pattern = r"https?://([\w\-\.]+)"
test_urls_clean = test_urls.str.extract(pattern, flags=re.I, expand=False)
domains = hn['url'].str.extract(pattern, flags=re.I, expand=False)
top_domains = domains.value_counts().head(5)

# Three capture groups split each URL into protocol / domain / path.
pattern = r"(https?)://([\w\.\-]+)/?(.*)"
test_url_parts = test_urls.str.extract(pattern, flags=re.I)
url_parts = hn['url'].str.extract(pattern, flags=re.I)

# Named groups give the resulting DataFrame meaningful column names.
pattern = r"(?P<protocol>https?)://(?P<domain>[\w\.\-]+)/?(?P<path>.*)"
url_parts = hn['url'].str.extract(pattern, flags=re.I)


re — Regular expression operations — Python 3.10.2 documentation

RegExr: Learn, Build, & Test RegEx


List Comprehensions and Lambda Functions
# JSON text describing the final two games of the 2018 World Cup.
# NOTE(review): the scraped version of this string had lost its '[' and '{'
# delimiters, which made json.loads raise JSONDecodeError; they are restored
# here to form a valid JSON array of two objects.
world_cup_str = """
[
    {
        "team_1": "France",
        "team_2": "Croatia",
        "game_type": "Final",
        "score" : [4, 2]
    },
    {
        "team_1": "Belgium",
        "team_2": "England",
        "game_type": "3rd/4th Playoff",
        "score" : [2, 0]
    }
]
"""

import json

# Parse the JSON string into a Python list of dicts.
world_cup_obj = json.loads(world_cup_str)
JSON

json — JSON encoder and decoder — Python 3.7.12 documentation

5. Data Structures — Python 3.10.2 documentation

4. More Control Flow Tools — Python 3.10.2 documentation

Working with Missing Data

You might also like