Professional Documents
Culture Documents
Web Scraping Using Beautiful Soup
Web Scraping Using Beautiful Soup
Web Scraping Using Beautiful Soup
# Sample HTML markup used as parser input for the BeautifulSoup examples that follow.
# The string ends without closing </body> and </html> tags.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
In [3]:
In [4]:
# BeautifulSoup has to be imported before its first use; no import is visible
# earlier in this notebook, so a fresh kernel would raise NameError here.
from bs4 import BeautifulSoup

# Parse the sample markup with the 'html.parser' backend into a navigable tree.
soup=BeautifulSoup(html_doc,'html.parser')
In [5]:
soup
Out[5]:
In [6]:
type(soup)
Out[6]:
bs4.BeautifulSoup
localhost:8890/notebooks/web-scraping%3ABeautifulSoup.ipynb#convert/parser-the-above-str-data-to-html-type 1/11
27/04/2023, 02:20 web-scraping:BeautifulSoup - Jupyter Notebook
In [7]:
soup.title
Out[7]:
In [8]:
soup.title.get_text()
Out[8]:
In [9]:
soup.p
Out[9]:
In [10]:
Out[10]:
In [11]:
soup.find('p')
Out[11]:
In [12]:
soup.find_all('p')
Out[12]:
localhost:8890/notebooks/web-scraping%3ABeautifulSoup.ipynb#convert/parser-the-above-str-data-to-html-type 2/11
27/04/2023, 02:20 web-scraping:BeautifulSoup - Jupyter Notebook
In [13]:
Out[13]:
<p class="story">Once upon a time there were three little sisters; and th
eir names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> an
d
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
In [14]:
Out[14]:
[<p class="story">Once upon a time there were three little sisters; and t
heir names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> a
nd
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a
>;
and they lived at the bottom of a well.</p>,
<p class="story">...</p>]
In [15]:
soup.find('a')
Out[15]:
In [16]:
soup.find_all('a')
Out[16]:
In [17]:
# You can fetch one particular element by specifying its id (an id is meant to be unique within a document).
soup.find(id='link2')
Out[17]:
localhost:8890/notebooks/web-scraping%3ABeautifulSoup.ipynb#convert/parser-the-above-str-data-to-html-type 3/11
27/04/2023, 02:20 web-scraping:BeautifulSoup - Jupyter Notebook
In [18]:
soup.find(id='link2').get_text()
Out[18]:
'Lacie'
In [19]:
Out[19]:
'http://example.com/lacie'
In [20]:
"https://en.wikipedia.org/wiki/List_of_state_and_union_territory_capitals_in_India"
Out[20]:
'https://en.wikipedia.org/wiki/List_of_state_and_union_territory_capitals
_in_India'
In [21]:
import urllib.request
In [22]:
# Open the Wikipedia page listing Indian state and union territory capitals.
# urlopen returns an HTTP response object that can be handed to BeautifulSoup.
page=urllib.request.urlopen("https://en.wikipedia.org/wiki/List_of_state_and_union_territory_capitals_in_India")
In [23]:
page
Out[23]:
<http.client.HTTPResponse at 0x7fb671798910>
In [24]:
Out[24]:
200
In [25]:
soup=BeautifulSoup(page, 'html.parser')
In [26]:
# To scrape the data, open the actual website and inspect the page elements to find the tag and class that contain it.
localhost:8890/notebooks/web-scraping%3ABeautifulSoup.ipynb#convert/parser-the-above-str-data-to-html-type 4/11
27/04/2023, 02:20 web-scraping:BeautifulSoup - Jupyter Notebook
In [27]:
# Select the first <table> that carries both the "wikitable" and "sortable" classes.
# A CSS selector matches regardless of class order or extra classes, whereas
# find('table', 'wikitable sortable') only matches when the class attribute is
# exactly that string.
table=soup.select_one('table.wikitable.sortable')
In [28]:
table
Out[28]:
In [29]:
# If your data is in a <table> tag, you don't need to call any other method; work directly with the table's HTML.
In [30]:
from io import StringIO

import pandas as pd

# read_html returns a list of DataFrames, one per <table> found in the markup.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas versions.
df=pd.read_html(StringIO(str(table)))
localhost:8890/notebooks/web-scraping%3ABeautifulSoup.ipynb#convert/parser-the-above-str-data-to-html-type 5/11
27/04/2023, 02:20 web-scraping:BeautifulSoup - Jupyter Notebook
In [31]:
df[0]
Out[31]:
Andhra Hyderab
0 Amaravati Amaravati Amaravati 2017
Pradesh (1956–2
Arunachal
1 Itanagar Itanagar Guwahati 1987
Pradesh
Shillo
2 Assam Dispur Dispur Guwahati 1972
(1950–
Ahmed
6 Gujarat Gandhinagar Gandhinagar Ahmedabad 1970
(1960–
Shimla
Himachal
8 Shimla (Summer)Dharamshala Shimla 1971
Pradesh
(Winter)[5]
Bangalore (Summer)
10 Karnataka Bangalore Bangalore 1956
Belgaum (Winter)
Madhya
12 Bhopal Bhopal Jabalpur 1956
Pradesh
Mumbai
13 Maharashtra Mumbai[e] (Summer)Nagpur Mumbai 1960
(Winter)
Uttar
25 Lucknow Lucknow Prayagraj 1950
Pradesh
Bhararisain (summer)
26 Uttarakhand Dehradun Nainital 2000
[7]Dehradun (winter)
localhost:8890/notebooks/web-scraping%3ABeautifulSoup.ipynb#convert/parser-the-above-str-data-to-html-type 6/11
27/04/2023, 02:20 web-scraping:BeautifulSoup - Jupyter Notebook
In [32]:
In [33]:
import requests
In [34]:
# Download the forecast page for the given latitude/longitude.
# The timeout stops this cell from hanging indefinitely if the server never responds.
page=requests.get("http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168", timeout=10)
In [35]:
page.status_code
Out[35]:
200
In [36]:
page
Out[36]:
<Response [200]>
In [37]:
soup=BeautifulSoup(page.content,'html.parser')
In [38]:
soup
Out[38]:
<!DOCTYPE html>
<html class="no-js">
<head>
<!-- Meta -->
<meta content="width=device-width" name="viewport"/>
<link href="http://purl.org/dc/elements/1.1/" rel="schema.DC"/><title>Nat
ional Weather Service</title><meta content="National Weather Service" nam
e="DC.title"><meta content="NOAA National Weather Service National Weathe
r Service" name="DC.description"/><meta content="US Department of Commerc
e, NOAA, National Weather Service" name="DC.creator"/><meta content="" na
me="DC.date.created" scheme="ISO8601"/><meta content="EN-US" name="DC.lan
guage" scheme="DCTERMS.RFC1766"/><meta content="weather, National Weather
Service" name="DC.keywords"/><meta content="NOAA's National Weather Servi
ce" name="DC.publisher"/><meta content="National Weather Service" name="D
C.contributor"/><meta content="//www.weather.gov/disclaimer.php" name="D
C.rights"/><meta content="General" name="rating"/><meta content="index,fo
llow" name="robots"/>
localhost:8890/notebooks/web-scraping%3ABeautifulSoup.ipynb#convert/parser-the-above-str-data-to-html-type 7/11
27/04/2023, 02:20 web-scraping:BeautifulSoup - Jupyter Notebook
In [39]:
# "seven-day-forecast" is the element id found by inspecting the page's HTML;
# that element is the one the forecast data is extracted from.
seven_day=soup.find(id="seven-day-forecast")
seven_day
Out[39]:
In [41]:
# "tombstone-container" is the div class found by inspecting the page's HTML;
# each such div holds one forecast period (period name, icon, short description, temperature).
containers=seven_day.find_all('div','tombstone-container')
containers
Out[41]:
[<div class="tombstone-container">
<p class="period-name">This<br/>Afternoon</p>
<p><img alt="This Afternoon: Sunny, with a high near 73. West wind 6 to
9 mph. " class="forecast-icon" src="newimages/medium/few.png" title="This
Afternoon: Sunny, with a high near 73. West wind 6 to 9 mph. "/></p><p cl
ass="short-desc">Sunny</p><p class="temp temp-high">High: 73 °F</p></div
>,
<div class="tombstone-container">
<p class="period-name">Tonight<br/><br/></p>
<p><img alt="Tonight: Partly cloudy, with a low around 51. West southwes
t wind 3 to 8 mph. " class="forecast-icon" src="newimages/medium/nsct.pn
g" title="Tonight: Partly cloudy, with a low around 51. West southwest wi
nd 3 to 8 mph. "/></p><p class="short-desc">Partly Cloudy</p><p class="te
mp temp-low">Low: 51 °F</p></div>,
<div class="tombstone-container">
<p class="period-name">Thursday<br/><br/></p>
<p><img alt="Thursday: Partly sunny, then gradually becoming sunny, with
a high near 77. Calm wind becoming west 5 to 7 mph in the morning. " clas
localhost:8890/notebooks/web-scraping%3ABeautifulSoup.ipynb#convert/parser-the-above-str-data-to-html-type 8/11
27/04/2023, 02:20 web-scraping:BeautifulSoup - Jupyter Notebook
In [42]:
Out[42]:
<div class="tombstone-container">
<p class="period-name">This<br/>Afternoon</p>
<p><img alt="This Afternoon: Sunny, with a high near 73. West wind 6 to 9
mph. " class="forecast-icon" src="newimages/medium/few.png" title="This A
fternoon: Sunny, with a high near 73. West wind 6 to 9 mph. "/></p><p cla
ss="short-desc">Sunny</p><p class="temp temp-high">High: 73 °F</p></div>
In [43]:
containers[0].find('p','period-name').get_text()
Out[43]:
'ThisAfternoon'
In [44]:
containers[0].find('p','short-desc').get_text()
Out[44]:
'Sunny'
In [45]:
containers[0].find('p','temp').get_text()
Out[45]:
'High: 73 °F'
In [46]:
containers[0].find('img')['title']
Out[46]:
'This Afternoon: Sunny, with a high near 73. West wind 6 to 9 mph. '
In [47]:
# Collect one value per forecast container into four parallel lists.
periods=[]
short_desc=[]
temp=[]
long_desc=[]
# NOTE: get_text() joins the text around a <br/> with no separator, which is
# why a name such as "This<br/>Afternoon" comes out as 'ThisAfternoon'.
for container in containers:
    periods.append(container.find('p','period-name').get_text())
    short_desc.append(container.find('p','short-desc').get_text())
    temp.append(container.find('p','temp').get_text())
    # The long description lives in the title attribute of the forecast icon.
    long_desc.append(container.find('img')['title'])
localhost:8890/notebooks/web-scraping%3ABeautifulSoup.ipynb#convert/parser-the-above-str-data-to-html-type 9/11
27/04/2023, 02:20 web-scraping:BeautifulSoup - Jupyter Notebook
In [48]:
periods
Out[48]:
['ThisAfternoon',
'Tonight',
'Thursday',
'ThursdayNight',
'Friday',
'FridayNight',
'Saturday',
'SaturdayNight',
'Sunday']
In [49]:
short_desc
Out[49]:
['Sunny',
'Partly Cloudy',
'BecomingSunny',
'Mostly Clear',
'Sunny',
'Partly Cloudy',
'Mostly Sunny',
'Mostly Clear',
'Sunny']
In [50]:
temp
Out[50]:
['High: 73 °F',
'Low: 51 °F',
'High: 77 °F',
'Low: 51 °F',
'High: 71 °F',
'Low: 50 °F',
'High: 69 °F',
'Low: 52 °F',
'High: 66 °F']
localhost:8890/notebooks/web-scraping%3ABeautifulSoup.ipynb#convert/parser-the-above-str-data-to-html-type 10/11
27/04/2023, 02:20 web-scraping:BeautifulSoup - Jupyter Notebook
In [51]:
long_desc
Out[51]:
['This Afternoon: Sunny, with a high near 73. West wind 6 to 9 mph. ',
'Tonight: Partly cloudy, with a low around 51. West southwest wind 3 to
8 mph. ',
'Thursday: Partly sunny, then gradually becoming sunny, with a high near
77. Calm wind becoming west 5 to 7 mph in the morning. ',
'Thursday Night: Mostly clear, with a low around 51. West southwest wind
5 to 10 mph. ',
'Friday: Sunny, with a high near 71. West wind 3 to 7 mph. ',
'Friday Night: Partly cloudy, with a low around 50.',
'Saturday: Mostly sunny, with a high near 69.',
'Saturday Night: Mostly clear, with a low around 52.',
'Sunday: Sunny, with a high near 66.']
In [52]:
# Assemble the four parallel lists into one table, one column per list.
forecast_columns = {
    'Periods': periods,
    'Short_desc': short_desc,
    'Temperature': temp,
    'Long_desc': long_desc,
}
df = pd.DataFrame(forecast_columns)
In [53]:
df
Out[53]:
0 ThisAfternoon Sunny High: 73 °F This Afternoon: Sunny, with a high near 73. We...
1 Tonight Partly Cloudy Low: 51 °F Tonight: Partly cloudy, with a low around 51. ...
3 ThursdayNight Mostly Clear Low: 51 °F Thursday Night: Mostly clear, with a low aroun...
4 Friday Sunny High: 71 °F Friday: Sunny, with a high near 71. West wind ...
5 FridayNight Partly Cloudy Low: 50 °F Friday Night: Partly cloudy, with a low around...
6 Saturday Mostly Sunny High: 69 °F Saturday: Mostly sunny, with a high near 69.
7 SaturdayNight Mostly Clear Low: 52 °F Saturday Night: Mostly clear, with a low aroun...
In [54]:
localhost:8890/notebooks/web-scraping%3ABeautifulSoup.ipynb#convert/parser-the-above-str-data-to-html-type 11/11