Web Scraping Project

Scrapes job postings (title, company, location, posting time, experience) from
LinkedIn, Naukri, Monster, Indeed and Glassdoor into pandas DataFrames.
In [ ]: import time
import requests
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import bs4
from bs4 import BeautifulSoup
# ---------------------------------------------------------------- LINKEDIN ---
# Scrape job postings from a LinkedIn search-results page with Selenium and
# collect title / company / location / posting time into a pandas DataFrame.

title_list = []
company_list = []
location_list = []
time_of_posting_list = []
# LinkedIn does not publish required experience; the column is kept so the
# result lines up with the other scraped sites.
experince_required_list = []


def encode_spaces(text):
    """URL-encode spaces the way LinkedIn's query string expects ("%20")."""
    return text.replace(" ", "%20")


def city_only(location_text):
    """Return just the city portion of a "City, State, Country" string."""
    return location_text.split(",")[0]


def search_url(field, location):
    """Build the LinkedIn job-search URL for the given field and location."""
    return ("https://www.linkedin.com/jobs/search/?geoId=102713980&keywords="
            + encode_spaces(field) + "&location=" + location + "&start=25")


def job_xpath(index, suffix):
    """XPath of one component of the index-th job card on the results page."""
    return "//*[@id='main-content']/div/section/ul/li[" + str(index) + "]" + suffix


def main():
    """Ask the user for a field/location, scrape LinkedIn, return a DataFrame."""
    # Field in which the user wants to search for a job.
    job_field = input("Enter the field in which job is required : ")
    # Preferred job location; leaving it blank means pan-India.
    job_location = input("Enter the preferred location of job (You can enter a "
                         "specific state city can leave blank for pan India jobs): ")
    # BUG FIX: str.replace returns a NEW string (the original discarded it),
    # and replace("", "india") never produced a usable default.  Default a
    # blank answer to pan-India explicitly instead.
    if not job_location:
        job_location = "india"

    # Open an automated Chrome window on the results for the inputs above.
    driver = webdriver.Chrome("K:\\chromedriver.exe")
    driver.get(search_url(job_field, job_location))
    time.sleep(5)

    # LinkedIn uses infinite scroll, so scroll to the bottom 7 times to load
    # the whole page (about 173 results) before scraping.
    for _ in range(7):
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
        time.sleep(5)

    # One iteration per job card.  The XPaths follow a per-card pattern, so
    # the card index is spliced in.  MISSING VALUES become "Not Specified";
    # a card with no title at all is skipped entirely.
    for i in range(1, 174):
        try:
            title_list.append(
                driver.find_element_by_xpath(job_xpath(i, "/div[1]/h3")).text)
        except NoSuchElementException:
            continue
        try:
            company_list.append(
                driver.find_element_by_xpath(job_xpath(i, "/div[1]/h4")).text)
        except NoSuchElementException:
            company_list.append("Not Specified")
        try:
            # Keep only the city name from the full location string.
            location_list.append(city_only(
                driver.find_element_by_xpath(job_xpath(i, "/div[1]/div/span")).text))
        except NoSuchElementException:
            location_list.append("Not Specified")
        try:
            time_of_posting_list.append(
                driver.find_element_by_xpath(job_xpath(i, "/div[1]/div/time")).text)
        except NoSuchElementException:
            time_of_posting_list.append("Not Specified")
        # No experience info on LinkedIn; placeholder keeps columns aligned.
        experince_required_list.append("Not Specified")

    # Combine all the lists above into one tabular result.
    return pd.DataFrame({"Job_Title": title_list,
                         "Company": company_list,
                         "Location": location_list,
                         "Time of Posting": time_of_posting_list,
                         "Experince_Required": experince_required_list})


if __name__ == "__main__":
    job_data = main()
    print(job_data)
In [ ]: 2. NAUKRI
# ------------------------------------------------------------------ NAUKRI ---
# Loop over the job cards already located by Selenium (`jobs`, defined in an
# earlier cell) and pull each field out of a card's HTML with BeautifulSoup.
# MISSING VALUES are recorded as "Not Specified".
#
# BUG FIX: BeautifulSoup's find() returns None when a tag is absent, so the
# following `.text`/`.find()` raises AttributeError -- NOT Selenium's
# NoSuchElementException.  The original only caught NoSuchElementException for
# the experience/salary/location fields, so a card missing one of those tags
# crashed the loop; AttributeError (and IndexError for the footer spans) is
# now caught as well.
for job in jobs:
    # Selenium hands the source of this job card over to BeautifulSoup.
    result = job.get_attribute('innerHTML')
    soup = BeautifulSoup(result, 'html.parser')

    try:
        # Fetch the title text by class name; skip cards without a title.
        title.append(soup.find("a", class_='title').text)
    except (NoSuchElementException, AttributeError):
        continue
    try:
        company.append(soup.find("a", class_='subTitle').text)
    except (NoSuchElementException, AttributeError):
        company.append('Not Specified')
    try:
        # <li> tag with the experience class; the value sits in its <span>.
        span_data = soup.find('li', {'class': 'fleft grey-text br2 placeHolderLi experience'})
        experience.append(span_data.find('span').text)
    except (NoSuchElementException, AttributeError):
        experience.append('Not Specified')
    try:
        # <li> tag with the salary class; the value sits in its <span>.
        span_data_sal = soup.find('li', {'class': 'fleft grey-text br2 placeHolderLi salary'})
        salary.append(span_data_sal.find('span').text)
    except (NoSuchElementException, AttributeError):
        salary.append('Not Specified')
    try:
        # <li> tag with the location class; the value sits in its <span>.
        span_data_loc = soup.find('li', {'class': 'fleft grey-text br2 placeHolderLi location'})
        location.append(span_data_loc.find('span').text)
    except (NoSuchElementException, AttributeError):
        location.append('Not Specified')
    try:
        # Footer spans selected hierarchically: the posting date is usually
        # the second `.type` entry, but some cards only carry one.
        job_post = soup.select('.jobTupleFooter .type')
        try:
            job_posted.append(job_post[1].find('span').text)
        except (IndexError, AttributeError):
            job_posted.append(job_post[0].find('span').text)
    except (NoSuchElementException, IndexError, AttributeError):
        job_posted.append('Not Specified')

# Combine all the individual lists into tabular data.
naukri_job_data = pd.DataFrame({"Job_Title": title, "Company": company,
                                "Location": location, "Job_Experience": experience,
                                "Salary": salary, "Job_posted": job_posted})
In [ ]: 3. MONSTER
# ----------------------------------------------------------------- MONSTER ---
# Open an automated Chrome window on the Monster search results and scrape
# every job card on the page into the result lists.
driver = webdriver.Chrome("K:\\chromedriver.exe")

# Ask the user which field of jobs to search for; spaces are URL-encoded.
field = input("enter field:")
encoded_field = field.replace(" ", "%20")
url = "https://www.monsterindia.com/srp/results?start=25&sort=1&limit=25&query=" + encoded_field + "&searchId=f780cd3a-baf2-402b-9d0f-a390efa69eda"
driver.get(url)
time.sleep(5)

# (Disabled) code that derived the total number of result pages so that data
# could be scraped from all of them:
#   page_no_text = driver.find_element_by_xpath(
#       "//*[@id='root']/div[4]/div[2]/section[2]/div[1]/span").text
#   print(page_no_text)
#   page_extract = page_no_text.split()
#   total_result = int(page_extract[-1])
#   total_pages = round(total_result / 20)

# Card indices scraped from the page -- presumably the gaps skip non-job
# tiles; TODO confirm against the live page layout.
card_numbers = [1, 2, 3, 4, 6, 7, 9, 10, 12, 13, 14, 15, 17, 18, 19, 20,
                22, 25, 26, 28, 29, 30, 31, 33]

# Scrape every card; any MISSING VALUE is stored as "Not Specified".
for idx in card_numbers:
    # All fields of one card share this XPath prefix.
    row = '//*[@id="srp-right-part"]/div/div[1]/div[1]/div[2]/div/div[' + str(idx) + ']/div'
    try:
        # Title of the result, extracted through its XPath; a card without a
        # title is skipped entirely.
        Job_Title.append(driver.find_element_by_xpath(row + '/div[1]/div/div/h3/a').text)
    except NoSuchElementException:
        continue
    try:
        Job_Company.append(driver.find_element_by_xpath(row + '/div[1]/div/div/span/a').text)
    except NoSuchElementException:
        Job_Company.append("Not Specified")
    try:
        Location.append(driver.find_element_by_xpath(row + '/div[1]/div/div/div/div[1]/span/small').text)
    except NoSuchElementException:
        Location.append("Not Specified")
    try:
        Experience_Required.append(driver.find_element_by_xpath(row + '/div[1]/div/div/div/div[2]/span/small').text)
    except NoSuchElementException:
        Experience_Required.append("Not Specified")
    try:
        posted = driver.find_element_by_xpath(row + '/div[2]/div[1]/span').text
        if posted == "":
            # Some cards keep the posting date in a second span.
            posted = driver.find_element_by_xpath(row + "/div[2]/div[1]/span[2]").text
        Job_Posting_date.append(posted)
    except NoSuchElementException:
        Job_Posting_date.append("Not Specified")
In [ ]: 4. INDEED
In [ ]: import csv
import pandas as pd
import time
import requests
import bs4
from bs4 import BeautifulSoup
# ------------------------------------------------------------------ INDEED ---
# Scrape the first 7 Indeed result pages for "data scientist" jobs with
# requests + BeautifulSoup.
#
# BUG FIX: the original parsed `page` before any page had been fetched (a
# NameError), the page loop never re-parsed the newly fetched HTML, and the
# final print showed the LinkedIn DataFrame.  Fetching and parsing now happen
# together inside the loop, and this section prints its own DataFrame.

l1 = []            # company names
l2 = []            # job titles
l3 = []            # raw location strings
l4 = []            # unused; kept so other cells referencing it still work
l5 = []            # salary strings (collected, not tabulated)
l6 = []            # posting dates
location_list = [] # city-only locations


def collect_page(html):
    """Parse one result page's HTML and append every job field to the lists."""
    soup = BeautifulSoup(html, "html.parser")
    # Tag containing the company name.
    for tag in soup.findAll("span", {"class": "company"}):
        l1.append(tag.text)
    # Tag containing the designation/title.
    for tag in soup.findAll("a", {"class": "jobtitle turnstileLink"}):
        l2.append(tag.text)
    # Tag containing the job location.
    for tag in soup.select(".location"):
        l3.append(tag.text)
    # Tag containing the salary offered (absent on many cards).
    for tag in soup.findAll("span", {"class": "salaryText"}):
        l5.append(tag.text)
    # Tag containing when the opening was posted.
    for tag in soup.findAll("span", {"class": "date"}):
        l6.append(tag.text)


def cities(raw_locations):
    """Reduce each "City, State" string to just the city name."""
    return [loc.split(",")[0] for loc in raw_locations]


def scrape_indeed():
    """Fetch result pages 1-7, parse them, and return the combined DataFrame."""
    for k in range(1, 8):
        url = "https://www.indeed.co.in/jobs?q=data+scientist&l=" + str(k)
        page = requests.get(url)
        time.sleep(5)  # be polite between requests
        collect_page(page.content)

    location_list.extend(cities(l3))

    # Not every card carries every field, so trim all columns to the shortest
    # list to keep the DataFrame consistent.
    n = min(len(l1), len(l2), len(location_list), len(l6))
    return pd.DataFrame({"Job_Title": l2[:n], "Company": l1[:n],
                         "Location": location_list[:n],
                         "Time of Posting": l6[:n]})


if __name__ == "__main__":
    indeed_job_data = scrape_indeed()
    print(indeed_job_data)
In [ ]: 5. GLASSDOOR
# --------------------------------------------------------------- GLASSDOOR ---
# Lists for the desired parameters scraped from the Glassdoor results page.
Company_Name = []
Job_Position = []
Location = []
Time = []
# Not filled here; declared so the column exists alongside the other sites.
Experience_Required = []

# One iteration per result card on page 1 (30 cards).
#
# FIX: the original had no missing-value handling, so a single absent tag
# raised NoSuchElementException and killed the whole loop.  Missing values
# are now recorded as "Not Specified", consistent with the other sites; a
# card without a job position is skipped entirely (keeps lists aligned).
for i in range(1, 31):
    # All fields of one card share this XPath prefix.
    card = "//*[@id='MainCol']/div[1]/ul/li[" + str(i) + "]/div[2]"
    try:
        Job_Position.append(driver.find_element_by_xpath(card + "/a/span").text)
    except NoSuchElementException:
        continue
    try:
        Company_Name.append(driver.find_element_by_xpath(card + "/div[1]/a/span").text)
    except NoSuchElementException:
        Company_Name.append("Not Specified")
    try:
        Time.append(driver.find_element_by_xpath(card + "/div[2]/div/div[2]").text)
    except NoSuchElementException:
        Time.append("Not Specified")
    try:
        Location.append(driver.find_element_by_xpath(card + "/div[2]/span").text)
    except NoSuchElementException:
        Location.append("Not Specified")