Professional Documents
Culture Documents
Data Analyst in Python 2
Data Analyst in Python 2
import pandas as pd
bike_sharing = pd.read_csv('day.csv')
bike_sharing['dteday'] = pd.to_datetime(bike_sharing['dteday'])
plt.scatter(bike_sharing['workingday'], bike_sharing['casual'])
plt.scatter(bike_sharing['workingday'], bike_sharing['registered'])
plt.show()
2. Bar Plots
import matplotlib.pyplot as plt
plt.bar(working_days, registered_avg)
plt.show()
3. Customizing Bar Plots
import pandas as pd
bike_sharing = pd.read_csv('day.csv')
bike_sharing['dteday'] = pd.to_datetime(bike_sharing['dteday'])
weekday_averages = bike_sharing.groupby('weekday').mean()[['casual', 'registered']].reset_index() #
# It's not essential to understand how this code works; we'll cover this in a later course.
plt.bar(weekday_averages['weekday'], weekday_averages['registered'])
plt.xticks(ticks=[0, 1, 2, 3, 4, 5, 6],
rotation=30)
plt.show()
4. Frequency Tables
import matplotlib.pyplot as plt
unique_values = [1, 2, 3, 4]
plt.bar(unique_values, weather_2011)
plt.xticks(ticks=[1,2,3,4])
plt.title('Weather Patterns: 2011')
plt.ylabel('Frequency')
plt.xlabel('Unique Values')
plt.show()
plt.bar(unique_values, weather_2012)
plt.xticks(ticks=[1,2,3,4])
plt.ylabel('Frequency')
plt.xlabel('Unique Values')
plt.show()
bike_sharing = pd.read_csv('day.csv')
bike_sharing['dteday'] = pd.to_datetime(bike_sharing['dteday'])
registered_freq = bike_sharing['registered'].value_counts(bins=10).sort_index()
casual_freq = bike_sharing['casual'].value_counts(bins=10).sort_index()
6. Histograms
import pandas as pd
bike_sharing = pd.read_csv('day.csv')
bike_sharing['dteday'] = pd.to_datetime(bike_sharing['dteday'])
plt.hist(bike_sharing['casual'])
plt.show()
7. The Normal Distribution
sentence_1 = True
sentence_2 = False
sentence_3 = True
sentence_4 = True
sentence_5 = False
sentence_2 = False
sentence_3 = False
sentence_4 = False
9. Skewed Distributions
Bar Charts - Learn about this chart and tools to create it (datavizcatalogue.com)
traffic.head()
traffic.tail()
traffic.info()
3. Slowness in Traffic
import pandas as pd
plt.show()
sentence_1 = True
sentence_2 = True
sentence_3 = False
import pandas as pd
plt.show()
5. Frequency of Incidents
import pandas as pd
axis=1)
incidents.sum().plot.barh()
plt.show()
sentence_1 = False
sentence_2 = True
sentence_3 = True
6. Correlations with Traffic Slowness
import pandas as pd
y='Lack of electricity')
plt.show()
y='Point of flooding')
plt.show()
y='Semaphore off')
plt.show()
incident_frequencies = slowness_20_or_more.sum()
incident_frequencies.plot.barh()
plt.show()
8. How Traffic Slowness Changes
import pandas as pd
traffic_per_day = {}
each_day_traffic = traffic[i:i+27]
traffic_per_day[day] = each_day_traffic
traffic_per_day[day].plot.line(x='Hour (Coded)',
plt.title(day)
plt.ylim([0, 25])
plt.show()
9. Comparing Graphs
import pandas as pd
traffic_per_day = {}
each_day_traffic = traffic[i:i+27]
traffic_per_day[day] = each_day_traffic
plt.plot(traffic_per_day[day]['Hour (Coded)'],
label=day)
plt.legend()
plt.show()
plt.figure()
plt.subplot(3, 2, 1)
plt.subplot(3, 2, 2)
plt.subplot(3, 2, 6)
plt.subplot(3, 2, 3)
plt.subplot(3, 2, 4)
plt.subplot(3, 2, 5)
plt.show()
traffic_per_day = {}
each_day_traffic = traffic[i:i+27]
traffic_per_day[day] = each_day_traffic
plt.figure(figsize=(10,12))
plt.subplot(3, 2, i)
plt.plot(traffic_per_day[day]['Hour (Coded)'],
plt.title(day)
plt.ylim([0,25])
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
traffic_per_day = {}
each_day_traffic = traffic[i:i+27]
traffic_per_day[day] = each_day_traffic
plt.figure(figsize=(10,12))
plt.subplot(3, 2, i)
plt.plot(traffic_per_day[day]['Hour (Coded)'],
plt.title(day)
plt.ylim([0,25])
plt.subplot(3, 2, 6)
plt.plot(traffic_per_day[day]['Hour (Coded)'],
label=day)
plt.ylim([0,25])
plt.legend()
plt.show()
Chart Visualization — pandas 1.4.1 documentation (pydata.org)
housing = pd.read_csv('housing.csv')
sns.set_theme()
plt.show()
correlation = 'positive'
import pandas as pd
housing = pd.read_csv('housing.csv')
# sns.set_theme()
# plt.show()
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
sns.set_theme()
sentence_1 = True
sentence_2 = True
import pandas as pd
housing = pd.read_csv('housing.csv')
# sns.set_theme()
# plt.show()
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
sns.set_theme()
plt.show()
sentence_1 = False
sentence_2 = True
import pandas as pd
housing = pd.read_csv('housing.csv')
# sns.set_theme()
# plt.show()
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
sns.set_theme()
style='Rooms')
plt.show()
sentence_1 = False
sentence_2 = False
housing = pd.read_csv('housing.csv')
# sns.set_theme()
# style='Rooms')
# plt.show()
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
sns.set_theme()
style='Rooms', col='Year')
plt.show()
sentence_1 = True
sentence_2 = True
Visualizing statistical relationships — seaborn 0.11.2 documentation (pydata.org)
top20_deathtoll = pd.read_csv('top20_deathtoll.csv')
plt.barh(top20_deathtoll['Country_Other'],
top20_deathtoll['Total_Deaths'])
plt.show()
3. Matplotlib Interfaces
Data Visualization With Matplotlib Course | Dataquest
4. The OO Interface
import pandas as pd
top20_deathtoll = pd.read_csv('top20_deathtoll.csv')
fig, ax = plt.subplots()
ax.barh(top20_deathtoll['Country_Other'],
top20_deathtoll['Total_Deaths'])
plt.show()
5. Mobile-Friendly Proportions
import pandas as pd
top20_deathtoll = pd.read_csv('top20_deathtoll.csv')
ax.barh(top20_deathtoll['Country_Other'],
top20_deathtoll['Total_Deaths'])
plt.show()
6. Maximizing Data-Ink
7. Erasing Non-Data Ink
import pandas as pd
top20_deathtoll = pd.read_csv('top20_deathtoll.csv')
ax.barh(top20_deathtoll['Country_Other'],
top20_deathtoll['Total_Deaths'])
ax.spines[location].set_visible(False)
ax.tick_params(bottom=False, left=False)
plt.show()
top20_deathtoll = pd.read_csv('top20_deathtoll.csv')
# Initial Code
# ax.barh(top20_deathtoll['Country_Other'],
# top20_deathtoll['Total_Deaths'])
# ax.spines[location].set_visible(False)
# ax.tick_params(bottom=False, left=False)
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
ax.barh(top20_deathtoll['Country_Other'],
top20_deathtoll['Total_Deaths'],
height=0.45)
ax.spines[location].set_visible(False)
ax.tick_params(bottom=False, left=False)
plt.show()
top20_deathtoll = pd.read_csv('top20_deathtoll.csv')
# Initial Code
# ax.barh(top20_deathtoll['Country_Other'],
# top20_deathtoll['Total_Deaths'],
# height=0.45)
# for location in ['left', 'right', 'top', 'bottom']:
# ax.spines[location].set_visible(False)
# ax.tick_params(bottom=False, left=False)
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
ax.barh(top20_deathtoll['Country_Other'],
top20_deathtoll['Total_Deaths'],
height=0.45, color='#af0b1e')
ax.spines[location].set_visible(False)
ax.xaxis.tick_top()
ax.tick_params(top=False, left=False)
ax.tick_params(axis='x', colors='grey')
plt.show()
top20_deathtoll = pd.read_csv('top20_deathtoll.csv')
ax.barh(top20_deathtoll['Country_Other'],
top20_deathtoll['Total_Deaths'],
height=0.45, color='#af0b1e')
ax.spines[location].set_visible(False)
ax.xaxis.tick_top()
ax.tick_params(top=False, left=False)
ax.tick_params(axis='x', colors='grey')
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
ax.text(x=-80000, y=23.5,
weight='bold', size=17)
ax.text(x=-80000, y=22.5,
size=12)
plt.show()
import pandas as pd
top20_deathtoll = pd.read_csv('top20_deathtoll.csv')
ax.barh(top20_deathtoll['Country_Other'],
top20_deathtoll['Total_Deaths'],
height=0.45, color='#af0b1e')
ax.spines[location].set_visible(False)
ax.xaxis.tick_top()
ax.tick_params(top=False, left=False)
ax.tick_params(axis='x', colors='grey')
ax.text(x=-80000, y=23.5,
weight='bold', size=17)
ax.text(x=-80000, y=22.5,
size=12)
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
country_names = top20_deathtoll['Country_Other']
plt.show()
The Visual Display of Quantitative Information by Edward R. Tufte (goodreads.com)
1. Data Stories
2. Grid Charts in Matplotlib
import pandas as pd
death_toll = pd.read_csv('covid_avg_deaths.csv')
figsize=(6,8))
ax1.plot(death_toll['Month'], death_toll['New_deaths'])
ax2.plot(death_toll['Month'], death_toll['New_deaths'])
ax3.plot(death_toll['Month'], death_toll['New_deaths'])
ax4.plot(death_toll['Month'], death_toll['New_deaths'])
plt.show()
3. Faster Workflow
import pandas as pd
death_toll = pd.read_csv('covid_avg_deaths.csv')
figsize=(6,8))
for ax in axes:
ax.plot(death_toll['Month'], death_toll['New_deaths'])
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.tick_params(bottom=0, left=0)
ax.spines[location].set_visible(False)
plt.show()
death_toll = pd.read_csv('covid_avg_deaths.csv')
figsize=(6,8))
ax.plot(death_toll['Month'], death_toll['New_deaths'],
color='#af0b1e', alpha=0.1)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.tick_params(bottom=0, left=0)
ax.spines[location].set_visible(False)
ax1.plot(death_toll['Month'][:3], death_toll['New_deaths'][:3],
color='#af0b1e', linewidth=2.5)
ax2.plot(death_toll['Month'][2:6], death_toll['New_deaths'][2:6],
color='#af0b1e', linewidth=2.5)
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
ax3.plot(death_toll['Month'][5:10], death_toll['New_deaths'][5:10],
color='#af0b1e', linewidth=2.5)
ax4.plot(death_toll['Month'][9:12], death_toll['New_deaths'][9:12],
color='#af0b1e', linewidth=2.5)
plt.show()
death_toll = pd.read_csv('covid_avg_deaths.csv')
# Solution Code
ax2.text(3.7, 800, 'Mar - Jun', color='#af0b1e', weight='bold')
ax3.text(7.1, 500, 'Jun - Oct', color='#af0b1e', weight='bold')
ax4.text(10.5, 600, 'Oct - Dec', color='#af0b1e', weight='bold',
rotation=45)
plt.show()
6. Title and Subtitle
WHO Coronavirus (COVID-19) Dashboard | WHO Coronavirus (COVID-19) Dashboard With
Vaccination Data
import pandas as pd
import matplotlib.pyplot as plt
death_toll = pd.read_csv('covid_avg_deaths.csv')
figsize=(6,8))
for ax in axes:
ax.plot(death_toll['Month'], death_toll['New_deaths'],
color='#af0b1e', alpha=0.1)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.tick_params(bottom=0, left=0)
ax.spines[location].set_visible(False)
ax1.plot(death_toll['Month'][:3], death_toll['New_deaths'][:3],
color='#af0b1e', linewidth=2.5)
weight='bold', rotation=3)
ax2.plot(death_toll['Month'][2:6], death_toll['New_deaths'][2:6],
color='#af0b1e', linewidth=2.5)
ax3.plot(death_toll['Month'][5:10], death_toll['New_deaths'][5:10],
color='#af0b1e', linewidth=2.5)
color='#af0b1e', linewidth=2.5)
rotation=45)
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
size=14, weight='bold')
ax1.text(0.5, 3150, 'Average number of daily deaths per month in the US',
size=12)
plt.show()
death_toll = pd.read_csv('covid_avg_deaths.csv')
figsize=(6,8))
# Draw the full series faintly on every axes and strip the non-data ink.
for ax in axes:
    # Faded background line covering all months.
    ax.plot(death_toll['Month'], death_toll['New_deaths'],
            color='#af0b1e', alpha=0.1)
    # Remove tick labels and tick marks on both axes.
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.tick_params(bottom=0, left=0)
    # Hide all four spines of this axes.
    for location in ['left', 'right', 'top', 'bottom']:
        ax.spines[location].set_visible(False)
ax1.plot(death_toll['Month'][:3], death_toll['New_deaths'][:3],
color='#af0b1e', linewidth=2.5)
weight='bold', rotation=3)
size=14, weight='bold')
ax1.text(0.5, 3150, 'Average number of daily deaths per month in the US',
size=12)
ax2.plot(death_toll['Month'][2:6], death_toll['New_deaths'][2:6],
color='#af0b1e', linewidth=2.5)
ax3.plot(death_toll['Month'][5:10], death_toll['New_deaths'][5:10],
color='#af0b1e', linewidth=2.5)
ax4.plot(death_toll['Month'][9:12], death_toll['New_deaths'][9:12],
color='#af0b1e', linewidth=2.5)
rotation=45)
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
# Add a faint horizontal bar at y=1600 to every axes; xmin/xmax are
# fractions of the axes width.
for ax in axes:
    ax.axhline(y=1600, xmin=0.5, xmax=0.8,
               linewidth=6, color='#af0b1e',
               alpha=0.1)
'''
'''
plt.show()
death_toll = pd.read_csv('covid_avg_deaths.csv')
figsize=(6,8))
for ax in axes:
ax.plot(death_toll['Month'], death_toll['New_deaths'],
color='#af0b1e', alpha=0.1)
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.tick_params(bottom=0, left=0)
ax.spines[location].set_visible(False)
ax1.plot(death_toll['Month'][:3], death_toll['New_deaths'][:3],
color='#af0b1e', linewidth=2.5)
ax1.text(0.5, -80, '0', alpha=0.5)
weight='bold', rotation=3)
size=14, weight='bold')
ax1.text(0.5, 3150, 'Average number of daily deaths per month in the US',
size=12)
ax2.plot(death_toll['Month'][2:6], death_toll['New_deaths'][2:6],
color='#af0b1e', linewidth=2.5)
ax3.plot(death_toll['Month'][5:10], death_toll['New_deaths'][5:10],
color='#af0b1e', linewidth=2.5)
ax4.plot(death_toll['Month'][9:12], death_toll['New_deaths'][9:12],
color='#af0b1e', linewidth=2.5)
rotation=45)
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
linewidth=6, color='#af0b1e',
alpha=0.1)
ax.axhline(y=1600, xmin=0.5, xmax=xmax,
linewidth=6, color='#af0b1e')
color='#af0b1e', weight='bold')
plt.show()
The Visual Display of Quantitative Information by Edward R. Tufte (goodreads.com)
1. Gestalt Principles
Link:
sentence_2 = True
sentence_3 = False
3. Similarity
sentence_1 = False # It's similarity of color, not shape
sentence_2 = True
sentence_3 = True
4. Enclosure
sentence_1 = True
sentence_2 = True
sentence_3 = True
5. Connection
6. Visual Hierarchy
sentence_1 = False # Enclosure is stronger.
sentence_2 = True
7. Pre-Attentive Attributes
sentence_1 = True
sentence_2 = False
sentence_3 = True
Pre-attentive processing - Wikipedia
plt.show()
style.use('default')
plt.show()
red_corr = red_wine.corr()['quality'][:-1]
white_corr = white_wine.corr()['quality'][:-1]
print(white_corr)
3. FiveThirtyEight Style
# Initial Code
style.use('fivethirtyeight')
# ax.barh(red_corr.index, red_corr)
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
ax.grid(b=False)
ax.set_yticklabels([])
ax.set_xticklabels([])
plt.show()
ax.grid(b=False)
ax.set_yticklabels([])
ax.set_xticklabels([])
y_coord = 9.8
y_coord -= 1
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
ymin=0.1, ymax=0.9)
ymin=0.1, ymax=0.9)
plt.show()
5. Adding X-tick Labels
style.use('fivethirtyeight')
ax.grid(b=False)
ax.set_yticklabels([])
ax.set_xticklabels([])
y_coord = 9.8
y_coord -= 1
ymin=0.1, ymax=0.9)
ymin=0.1, ymax=0.9)
xmin=0.01, xmax=0.32)
color='grey', alpha=0.5)
xmin=0.67, xmax=0.98)
color='grey', alpha=0.5)
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
xmin=0.01, xmax=0.32)
xmin=0.67, xmax=0.98)
6. Adding a Signature
style.use('fivethirtyeight')
ax.grid(b=False)
ax.set_yticklabels([])
ax.set_xticklabels([])
y_coord = 9.8
y_coord -= 1
ymin=0.1, ymax=0.9)
ymin=0.1, ymax=0.9)
xmin=0.01, xmax=0.32)
color='grey', alpha=0.5)
xmin=0.67, xmax=0.98)
color='grey', alpha=0.5)
xmin=0.01, xmax=0.32)
xmin=0.67, xmax=0.98)
ax.text(-0.7, -2.9,
size=12)
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
ax.text(-0.7, 13.5,
size=17, weight='bold')
ax.text(-0.7, 12.7,
'Correlation values between wine quality and wine properties (alcohol, pH, etc.)')
plt.show()
style.use('fivethirtyeight')
color=color_map_white)
ax.grid(b=False)
ax.set_yticklabels([])
ax.set_xticklabels([])
y_coord = 9.8
y_coord -= 1
ymin=0.1, ymax=0.9)
ymin=0.1, ymax=0.9)
xmin=0.01, xmax=0.32)
color='grey', alpha=0.5)
xmin=0.67, xmax=0.98)
color='grey', alpha=0.5)
xmin=0.01, xmax=0.32)
xmin=0.67, xmax=0.98)
ax.text(1.75, 11.2, 'WHITE WINE', weight='bold')
size=12)
ax.text(-0.7, 13.5,
size=17, weight='bold')
ax.text(-0.7, 12.7,
'Correlation values between wine quality and wine properties (alcohol, pH, etc.)')
# *** *** *** *** *** *** *** *** *** ***
# Solution Code
color=color_map_red)
plt.show()
How to Generate FiveThirtyEight Graphs in Python – Dataquest
first_5 = happiness2015.head()
happiness2015.info()
mean_happiness = {}
regions = happiness2015['Region'].unique()
for r in regions:
region_group = happiness2015[happiness2015['Region'] == r]
mean_happiness[r] = region_mean
grouped = happiness2015.groupby('Region')
north_america = happiness2015.iloc[[4,14]]
means = grouped.mean()
grouped = happiness2015.groupby('Region')
happy_mean = happy_grouped.mean()
import numpy as np
grouped = happiness2015.groupby('Region')
def dif(group):
mean_max_dif = happy_grouped.agg(dif)
happiness_means = happiness2015.groupby('Region')['Happiness Score'].mean()
print(happiness_means)
pv_happiness = happiness2015.pivot_table(values='Happiness Score', index='Region',
aggfunc=np.mean, margins=True)
happiness2015 = pd.read_csv("World_Happiness_2015.csv")
happiness2016 = pd.read_csv("World_Happiness_2016.csv")
happiness2017 = pd.read_csv("World_Happiness_2017.csv")
happiness2015['Year'] = 2015
happiness2016['Year'] = 2016
happiness2017['Year'] = 2017
# Short labels for the verbose column names used in the happiness data.
mapping = {
    'Economy (GDP per Capita)': 'Economy',
    'Health (Life Expectancy)': 'Health',
    'Trust (Government Corruption)': 'Trust',
}
if element > 1:
return 'High'
else:
return 'Low'
economy_impact_map = happiness2015['Economy'].map(label)
economy_impact_apply = happiness2015['Economy'].apply(label)
equal = economy_impact_map.equals(economy_impact_apply)
def label(element):
    """Classify a value as 'High' when it is greater than 1, else 'Low'."""
    if element > 1:
        return 'High'
    else:
        return 'Low'
economy_impact_apply = happiness2015['Economy'].apply(label)
if element > x:
return 'High'
else:
return 'Low'
def label(element):
    """Classify a value as 'High' when it is greater than 1, else 'Low'."""
    if element > 1:
        return 'High'
    else:
        return 'Low'
economy_apply = happiness2015['Economy'].apply(label)
factors_impact = happiness2015[factors].applymap(label)
def v_counts(col):
    """Return each distinct value's share of the entries in ``col``.

    The counts from ``col.value_counts()`` are divided by ``col.size``.
    ``value_counts()`` skips missing values by default while ``size``
    counts every entry, so the shares can sum to less than 1 when ``col``
    contains missing values.
    """
    num = col.value_counts()
    den = col.size
    return num / den
v_counts_pct = factors_impact.apply(v_counts)
factors = ['Economy', 'Family', 'Health', 'Freedom', 'Trust', 'Generosity', 'Dystopia Residual']
def percentages(col):
factor_percentages = happiness2015[factors].apply(percentages)
def extract_last_word(element):
    """Return the last whitespace-separated word of ``str(element)``.

    Returns an empty string when the text has no words (empty or
    whitespace-only) instead of raising ``IndexError``.
    """
    words = str(element).split()
    return words[-1] if words else ""
print(merged['Currency Vectorized'].head())
lengths = merged['CurrencyUnit'].str.len()
value_counts = lengths.value_counts(dropna=False)
national_accounts = merged['SpecialNotes'].str.contains(pattern)
print(national_accounts.head())
pattern = r"[Nn]ational accounts"
merged_national_accounts = merged[national_accounts]
print(merged_national_accounts.head())
pattern =r"()"
pattern = r"([1-2][0-9]{3})"
years = merged['SpecialNotes'].str.extract(pattern)
pattern = r"(?P<Years>[1-2][0-9]{3})"
years = merged['IESurvey'].str.extractall(pattern)
value_counts = years['Years'].value_counts()
print(value_counts)
pattern = r"(?P<First_Year>[1-2][0-9]{3})/?(?P<Second_Year>[0-9]{2})?"
years = merged['IESurvey'].str.extractall(pattern)
first_two_year = years['First_Year'].str[0:2]
plt.show()
Working with text data — pandas 1.4.1 documentation (pydata.org)
shape_2015 = happiness2015.shape
shape_2016 = happiness2016.shape
shape_2017 = happiness2017.shape
missing_2016 = happiness2016.isnull().sum()
missing_2017 = happiness2017.isnull().sum()
missing = combined.isnull().sum()
regions_2017 = combined[combined['YEAR']==2017]['REGION']
missing = regions_2017.isnull().sum()
missing = combined.isnull().sum()
combined['COUNTRY'] = combined['COUNTRY'].str.upper()
print(combined[dups])
combined['COUNTRY'] = combined['COUNTRY'].str.upper()
missing = combined.isnull().sum()
combined = combined.dropna(thresh=159, axis=1)
missing = combined.isnull().sum()
print(happiness_mean)
combined = combined.dropna()
missing = combined.isnull().sum()
Working with missing data — pandas 1.4.1 documentation (pydata.org)
Regular Expression Basics
import re
# Count how many titles contain "Python" or "python".
titles = hn["title"].tolist()
python_mentions = 0
pattern = r"[Pp]ython"
for t in titles:
    # re.search returns a match object (truthy) when the pattern occurs
    # anywhere in the title, and None otherwise.
    if re.search(pattern, t):
        python_mentions += 1
re — Regular expression operations — Python 3.10.2 documentation
import pandas as pd
import re
hn = pd.read_csv("hacker_news.csv")
titles = hn['title']
sql_pattern = r"SQL"
hn_sql["flavor"] = hn_sql["flavor"].str.lower()
py_versions_freq = dict(py_versions.value_counts())
def first_10_matches(pattern):
    """Return the first 10 entries of ``titles`` that match ``pattern``.

    ``titles`` is the module-level Series of titles; it is filtered with
    ``titles.str.contains(pattern)``, so ``pattern`` is treated as a
    regular expression. Fewer than 10 entries are returned when fewer
    titles match.
    """
    all_matches = titles[titles.str.contains(pattern)]
    first_10 = all_matches.head(10)
    return first_10
# pattern = r"\b[Cc]\b"
pattern = r"\b[Cc]\b[^.+]"
first_ten = first_10_matches(pattern)
pattern = r"(?<!Series\s)\b[Cc]\b((?![+.])|\.$)"
c_mentions = titles.str.contains(pattern).sum()
pattern = r"\b(\w+)\s\1\b"
repeated_words = titles[titles.str.contains(pattern)]
pattern = r"\be[-\s]?mail"
'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429',
'http://www.interactivedynamicvideo.com/',
'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0',
'http://evonomics.com/advertising-cannot-maintain-internet-heres-solution/',
'HTTPS://github.com/keppel/pinn',
'Http://phys.org/news/2015-09-scale-solar-youve.html',
'https://iot.seeed.cc',
'http://www.bfilipek.com/2016/04/custom-deleters-for-c-smart-pointers.html',
'http://beta.crowdfireapp.com/?beta=agnipath',
'https://www.valid.ly?param',
'http://css-cursor.techstream.org'
])
pattern = r"https?://([\w\-\.]+)"
top_domains = domains.value_counts().head(5)
# `test_urls` is available from the previous screen
pattern = r"(https?)://([\w\.\-]+)/?(.*)"
# pattern = r"(https?)://([\w\.\-]+)/?(.*)"
pattern = r"(?P<protocol>https?)://(?P<domain>[\w\.\-]+)/?(?P<path>.*)"
"team_1": "France",
"team_2": "Croatia",
"game_type": "Final",
"score" : [4, 2]
},
"team_1": "Belgium",
"team_2": "England",
"score" : [2, 0]
]
"""
import json
world_cup_obj = json.loads(world_cup_str)
JSON