Web Scraping Using Beautiful Soup



In [2]:

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [3]:

## convert/parse the string data above into a parsed HTML tree (BeautifulSoup object)

In [4]:

from bs4 import BeautifulSoup

soup=BeautifulSoup(html_doc,'html.parser')
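
As a side note, BeautifulSoup can use parsers other than the bundled 'html.parser'. A minimal sketch, assuming the optional lxml package has been installed separately (it is not shown in this notebook):

# 'lxml' is usually faster and more forgiving than 'html.parser'
# (assumption: lxml was installed, e.g. with pip install lxml)
soup_lxml = BeautifulSoup(html_doc, 'lxml')
type(soup_lxml)   # bs4.BeautifulSoup, same API either way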

In [5]:

soup

Out[5]:

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [6]:

type(soup)

Out[6]:

bs4.BeautifulSoup


In [7]:

soup.title

Out[7]:

<title>The Dormouse's story</title>

In [8]:

soup.title.get_text()

Out[8]:

"The Dormouse's story"

In [9]:

soup.p

Out[9]:

<p class="title"><b>The Dormouse's story</b></p>

In [10]:

#to get the text; inside the p tag there is a b tag


soup.p.b.get_text()

Out[10]:

"The Dormouse's story"

In [11]:

soup.find('p')

Out[11]:

<p class="title"><b>The Dormouse's story</b></p>

In [12]:

soup.find_all('p')

Out[12]:

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]


In [13]:

#to find a specific element based on its class or id, use find


#we will get only the first p tag whose class is "story"
soup.find('p','story')

Out[13]:

<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
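
The positional second argument of find() is a shortcut for matching on the class attribute. The more explicit forms below are equivalent; a small sketch reusing the same soup:

soup.find('p', 'story')                    # shorthand used above
soup.find('p', class_='story')             # keyword form; class_ avoids the Python keyword
soup.find('p', attrs={'class': 'story'})   # explicit attrs dictionary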

In [14]:

#to find all p tags with class="story"


soup.find_all('p','story')

Out[14]:

[<p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [15]:

soup.find('a')

Out[15]:

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [16]:

soup.find_all('a')

Out[16]:

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [17]:

#you can fetch a particular element by specifying its id (an id is always unique on a page)
soup.find(id='link2')

Out[17]:

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>


In [18]:

soup.find(id='link2').get_text()

Out[18]:

'Lacie'

In [19]:

#you can specify the attribute you want to get


soup.find(id='link2')['href']

Out[19]:

'http://example.com/lacie'
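
CSS selectors give the same lookups in one call through select()/select_one(); a sketch, again on the soup built from html_doc:

soup.select_one('#link2')['href']            # 'http://example.com/lacie', matched by id

# collect every sister link's href in one pass
[a['href'] for a in soup.select('a.sister')]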

In [20]:

"https://en.wikipedia.org/wiki/List_of_state_and_union_territory_capitals_in_India"

Out[20]:

'https://en.wikipedia.org/wiki/List_of_state_and_union_territory_capitals_in_India'

In [21]:

import urllib.request

In [22]:

page=urllib.request.urlopen("https://en.wikipedia.org/wiki/List_of_state_and_union_territory_capitals_in_India")

In [23]:

page

Out[23]:

<http.client.HTTPResponse at 0x7fb671798910>

In [24]:

page.status #200 means the request succeeded (HTTP OK)

Out[24]:

200
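
Real pages do not always return 200, so it can be worth guarding the request. A defensive sketch with urllib; the url variable below is just the Wikipedia address used above:

import urllib.request
from urllib.error import HTTPError, URLError

url = "https://en.wikipedia.org/wiki/List_of_state_and_union_territory_capitals_in_India"
try:
    with urllib.request.urlopen(url, timeout=30) as resp:
        html = resp.read()      # raw bytes of the page; resp.status is 200 on success
except (HTTPError, URLError) as err:
    print("request failed:", err)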

In [25]:

soup=BeautifulSoup(page, 'html.parser')

In [26]:

#For scraping the data you need to open the actual website and inspect the elements to find the tags, classes, or ids that hold what you want


In [27]:

table=soup.find('table','wikitable sortable')

In [28]:

table

Out[28]:

<table class="wikitable sortable">


<tbody><tr>
<th>State
</th>
<th>Administrative/<br/>Executive capital
</th>
<th>Legislative capital
</th>
<th>Judicial capital
</th>
<th>Year of<br/>establishment
</th>
<th>Former capital
</th></tr>
<tr>
<td><b><a href="/wiki/Andhra_Pradesh" title="Andhra Pradesh">Andhra Prade
sh</a></b>
</td>
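
Matching on the string 'wikitable sortable' works here because that is exactly the table's class attribute. An equivalent lookup that does not depend on class order uses a CSS selector; a sketch on the same soup:

table = soup.select_one('table.wikitable.sortable')   # requires both classes, in any order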

In [29]:

#if your data is inside a table tag you don't need any other scraping method; pandas can read it directly

In [30]:

import pandas as pd
df=pd.read_html(str(table))
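
pd.read_html() returns a list with one DataFrame per table found in the markup, which is why the result is indexed with df[0] below. Recent pandas versions prefer a file-like object over a raw HTML string; a sketch of that variant, assuming a reasonably current pandas:

from io import StringIO

tables = pd.read_html(StringIO(str(table)))   # list of DataFrames, one per <table>
len(tables)                                   # 1 here, since we passed a single table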


In [31]:

df[0]

Out[31]:

    State              Administrative/Executive capital            Legislative capital                   Judicial capital   Year of establishment   Former capital
0   Andhra Pradesh     Amaravati                                   Amaravati                             Amaravati          2017                    Hyderabad (1956–2…)
1   Arunachal Pradesh  Itanagar                                    Itanagar                              Guwahati           1987                    …
2   Assam              Dispur                                      Dispur                                Guwahati           1972                    Shillong (1950–…)
3   Bihar              Patna                                       Patna                                 Patna              1950                    …
4   Chhattisgarh       Raipur[c]                                   Raipur                                Bilaspur           2000                    …
5   Goa                Panaji[d]                                   Porvorim                              Mumbai             1987                    …
6   Gujarat            Gandhinagar                                 Gandhinagar                           Ahmedabad          1970                    Ahmedabad (1960–…)
7   Haryana            Chandigarh                                  Chandigarh                            Chandigarh         1966                    …
8   Himachal Pradesh   Shimla (Summer) Dharamshala (Winter)[5]     Shimla                                Shimla             1971                    …
9   Jharkhand          Ranchi                                      Ranchi                                Ranchi             2000                    …
10  Karnataka          Bangalore                                   Bangalore (Summer) Belgaum (Winter)   Bangalore          1956                    …
11  Kerala             Thiruvananthapuram                          Thiruvananthapuram                    Ernakulam          1956                    …
12  Madhya Pradesh     Bhopal                                      Bhopal                                Jabalpur           1956                    …
13  Maharashtra        Mumbai[e]                                   Mumbai (Summer) Nagpur (Winter)       Mumbai             1960                    …
14  Manipur            Imphal                                      Imphal                                Imphal             1972                    …
15  Meghalaya          Shillong                                    Shillong                              Shillong           1972                    …
16  Mizoram            Aizawl                                      Aizawl                                Guwahati           1987                    …
17  Nagaland           Kohima                                      Kohima                                Guwahati           1963                    …
18  Odisha             Bhubaneswar                                 Bhubaneswar                           Cuttack            1950                    …
19  Punjab             Chandigarh                                  Chandigarh                            Chandigarh         1966                    …
20  Rajasthan          Jaipur                                      Jaipur                                Jodhpur            1950                    …
21  Sikkim             Gangtok[f]                                  Gangtok                               Gangtok            1975                    …
22  Tamil Nadu         Chennai[g]                                  Chennai                               Chennai            1956                    …
23  Telangana          Hyderabad                                   Hyderabad                             Hyderabad          2014                    …
24  Tripura            Agartala                                    Agartala                              Agartala           1972                    …
25  Uttar Pradesh      Lucknow                                     Lucknow                               Prayagraj          1950                    …
26  Uttarakhand        Bhararisain (summer)[7] Dehradun (winter)   Dehradun                              Nainital           2000                    …
27  West Bengal        Kolkata                                     Kolkata                               Kolkata            1950                    …

(The Former capital column runs off the page edge in the exported output; entries marked … are cut off there.)


In [32]:

#2nd method: for pages where the data is not inside a table tag.

In [33]:

import requests

In [34]:

page=requests.get("http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168")

In [35]:

page.status_code

Out[35]:

200

In [36]:

page

Out[36]:

<Response [200]>
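
As with urllib, it can be worth failing loudly when the request does not succeed; a small sketch using standard requests features:

resp = requests.get("http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168", timeout=30)
resp.raise_for_status()     # raises requests.exceptions.HTTPError on a 4xx/5xx response
len(resp.content)           # size of the downloaded page in bytes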

In [37]:

soup=BeautifulSoup(page.content,'html.parser')

In [38]:

soup

Out[38]:

<!DOCTYPE html>

<html class="no-js">
<head>
<!-- Meta -->
<meta content="width=device-width" name="viewport"/>
<link href="http://purl.org/dc/elements/1.1/" rel="schema.DC"/><title>National Weather Service</title><meta content="National Weather Service" name="DC.title"><meta content="NOAA National Weather Service National Weather Service" name="DC.description"/><meta content="US Department of Commerce, NOAA, National Weather Service" name="DC.creator"/><meta content="" name="DC.date.created" scheme="ISO8601"/><meta content="EN-US" name="DC.language" scheme="DCTERMS.RFC1766"/><meta content="weather, National Weather Service" name="DC.keywords"/><meta content="NOAA's National Weather Service" name="DC.publisher"/><meta content="National Weather Service" name="DC.contributor"/><meta content="//www.weather.gov/disclaimer.php" name="DC.rights"/><meta content="General" name="rating"/><meta content="index,follow" name="robots"/>


In [39]:

#here 'seven-day-forecast' is the id found by inspecting the HTML elements; it wraps the forecast data we want to extract
seven_day=soup.find(id="seven-day-forecast")
seven_day

Out[39]:

<div class="panel panel-default" id="seven-day-forecast">


<div class="panel-heading">
<b>Extended Forecast for</b>
<h2 class="panel-title">
San Francisco CA </h2>
</div>
<div class="panel-body" id="seven-day-forecast-body">
<div id="seven-day-forecast-container"><ul class="list-unstyled" id="seve
n-day-forecast-list"><li class="forecast-tombstone">
<div class="tombstone-container">
<p class="period-name">This<br/>Afternoon</p>
<p><img alt="This Afternoon: Sunny, with a high near 73. West wind 6 to 9
mph. " class="forecast-icon" src="newimages/medium/few.png" title="This A
fternoon: Sunny, with a high near 73. West wind 6 to 9 mph. "/></p><p cla
ss="short-desc">Sunny</p><p class="temp temp-high">High: 73 °F</p></div>
</li><li class="forecast-tombstone">
<div class="tombstone-container">
<p class="period-name">Tonight<br/><br/></p>

In [41]:

#here 'tombstone-container' is the div class found by inspecting the HTML elements; each one holds a single forecast period
containers=seven_day.find_all('div','tombstone-container')
containers

Out[41]:

[<div class="tombstone-container">
<p class="period-name">This<br/>Afternoon</p>
<p><img alt="This Afternoon: Sunny, with a high near 73. West wind 6 to
9 mph. " class="forecast-icon" src="newimages/medium/few.png" title="This
Afternoon: Sunny, with a high near 73. West wind 6 to 9 mph. "/></p><p cl
ass="short-desc">Sunny</p><p class="temp temp-high">High: 73 °F</p></div
>,
<div class="tombstone-container">
<p class="period-name">Tonight<br/><br/></p>
<p><img alt="Tonight: Partly cloudy, with a low around 51. West southwes
t wind 3 to 8 mph. " class="forecast-icon" src="newimages/medium/nsct.pn
g" title="Tonight: Partly cloudy, with a low around 51. West southwest wi
nd 3 to 8 mph. "/></p><p class="short-desc">Partly Cloudy</p><p class="te
mp temp-low">Low: 51 °F</p></div>,
<div class="tombstone-container">
<p class="period-name">Thursday<br/><br/></p>
<p><img alt="Thursday: Partly sunny, then gradually becoming sunny, with
a high near 77. Calm wind becoming west 5 to 7 mph in the morning. " clas


In [42]:

#first item of list


containers[0]

Out[42]:

<div class="tombstone-container">
<p class="period-name">This<br/>Afternoon</p>
<p><img alt="This Afternoon: Sunny, with a high near 73. West wind 6 to 9
mph. " class="forecast-icon" src="newimages/medium/few.png" title="This A
fternoon: Sunny, with a high near 73. West wind 6 to 9 mph. "/></p><p cla
ss="short-desc">Sunny</p><p class="temp temp-high">High: 73 °F</p></div>

In [43]:

containers[0].find('p','period-name').get_text()

Out[43]:

'ThisAfternoon'

In [44]:

containers[0].find('p','short-desc').get_text()

Out[44]:

'Sunny'

In [45]:

containers[0].find('p','temp').get_text()

Out[45]:

'High: 73 °F'

In [46]:

containers[0].find('img')['title']

Out[46]:

'This Afternoon: Sunny, with a high near 73. West wind 6 to 9 mph. '

In [47]:

periods=[]
short_desc=[]
temp=[]
long_desc=[]

for x in range(len(containers)):
    periods.append(containers[x].find('p','period-name').get_text())
    short_desc.append(containers[x].find('p','short-desc').get_text())
    temp.append(containers[x].find('p','temp').get_text())
    long_desc.append(containers[x].find('img')['title'])
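
An equivalent way to gather the same fields is to build one dictionary per forecast period and let pandas create the columns; a sketch using the containers list from above:

rows = []
for c in containers:
    rows.append({'Periods': c.find('p', 'period-name').get_text(),
                 'Short_desc': c.find('p', 'short-desc').get_text(),
                 'Temperature': c.find('p', 'temp').get_text(),
                 'Long_desc': c.find('img')['title']})

forecast = pd.DataFrame(rows)   # same data as the four parallel lists used in the next cells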


In [48]:

periods

Out[48]:

['ThisAfternoon',
'Tonight',
'Thursday',
'ThursdayNight',
'Friday',
'FridayNight',
'Saturday',
'SaturdayNight',
'Sunday']

In [49]:

short_desc

Out[49]:

['Sunny',
'Partly Cloudy',
'BecomingSunny',
'Mostly Clear',
'Sunny',
'Partly Cloudy',
'Mostly Sunny',
'Mostly Clear',
'Sunny']

In [50]:

temp

Out[50]:

['High: 73 °F',
'Low: 51 °F',
'High: 77 °F',
'Low: 51 °F',
'High: 71 °F',
'Low: 50 °F',
'High: 69 °F',
'Low: 52 °F',
'High: 66 °F']


In [51]:

long_desc

Out[51]:

['This Afternoon: Sunny, with a high near 73. West wind 6 to 9 mph. ',
 'Tonight: Partly cloudy, with a low around 51. West southwest wind 3 to 8 mph. ',
 'Thursday: Partly sunny, then gradually becoming sunny, with a high near 77. Calm wind becoming west 5 to 7 mph in the morning. ',
 'Thursday Night: Mostly clear, with a low around 51. West southwest wind 5 to 10 mph. ',
 'Friday: Sunny, with a high near 71. West wind 3 to 7 mph. ',
 'Friday Night: Partly cloudy, with a low around 50.',
 'Saturday: Mostly sunny, with a high near 69.',
 'Saturday Night: Mostly clear, with a low around 52.',
 'Sunday: Sunny, with a high near 66.']

In [52]:

df=pd.DataFrame({'Periods':periods,
'Short_desc':short_desc,
'Temperature':temp,
'Long_desc':long_desc})
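
The Temperature column is still text like 'High: 73 °F'. If you want a numeric column for analysis, a small sketch (assuming every entry follows that 'Label: NN °F' pattern seen above):

df['Temp_F'] = df['Temperature'].str.extract(r'(\d+)', expand=False).astype(int)
df['Is_High'] = df['Temperature'].str.startswith('High')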

In [53]:

df

Out[53]:

Periods Short_desc Temperature Long_desc

0 ThisAfternoon Sunny High: 73 °F This Afternoon: Sunny, with a high near 73. We...

1 Tonight Partly Cloudy Low: 51 °F Tonight: Partly cloudy, with a low around 51. ...

2 Thursday BecomingSunny High: 77 °F Thursday: Partly sunny, then gradually becomin...

3 ThursdayNight Mostly Clear Low: 51 °F Thursday Night: Mostly clear, with a low aroun...

4 Friday Sunny High: 71 °F Friday: Sunny, with a high near 71. West wind ...

5 FridayNight Partly Cloudy Low: 50 °F Friday Night: Partly cloudy, with a low around...

6 Saturday Mostly Sunny High: 69 °F Saturday: Mostly sunny, with a high near 69.

7 SaturdayNight Mostly Clear Low: 52 °F Saturday Night: Mostly clear, with a low aroun...

8 Sunday Sunny High: 66 °F Sunday: Sunny, with a high near 66.

In [54]:

#Save the table for further analysis on your local desktop.


df.to_csv('/Users/muskansharma/Desktop/weather.csv',index=False)
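
The absolute path above is specific to this machine; a more portable sketch writes next to the notebook and reads the file back to verify:

df.to_csv('weather.csv', index=False)
pd.read_csv('weather.csv').head()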

