Web Scraping Project

Scrapes job postings (title, company, location, posting time, experience) from
LinkedIn, Naukri, Monster, Indeed and Glassdoor into pandas DataFrames.
In [ ]: import time
import requests
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import bs4
from bs4 import BeautifulSoup
# ---------------------------------------------------------------- LINKEDIN ---
# Scrape job postings from a LinkedIn search-results page with Selenium and
# collect title / company / location / posting time into a pandas DataFrame.

title_list = []
company_list = []
location_list = []
time_of_posting_list = []
# LinkedIn does not publish required experience; the column is kept so the
# result lines up with the other scraped sites.
experince_required_list = []


def encode_spaces(text):
    """URL-encode spaces the way LinkedIn's query string expects ("%20")."""
    return text.replace(" ", "%20")


def city_only(location_text):
    """Return just the city portion of a "City, State, Country" string."""
    return location_text.split(",")[0]


def search_url(field, location):
    """Build the LinkedIn job-search URL for the given field and location."""
    return ("https://www.linkedin.com/jobs/search/?geoId=102713980&keywords="
            + encode_spaces(field) + "&location=" + location + "&start=25")


def job_xpath(index, suffix):
    """XPath of one component of the index-th job card on the results page."""
    return "//*[@id='main-content']/div/section/ul/li[" + str(index) + "]" + suffix


def main():
    """Ask the user for a field/location, scrape LinkedIn, return a DataFrame."""
    # Field in which the user wants to search for a job.
    job_field = input("Enter the field in which job is required : ")
    # Preferred job location; leaving it blank means pan-India.
    job_location = input("Enter the preferred location of job (You can enter a "
                         "specific state city can leave blank for pan India jobs): ")
    # BUG FIX: str.replace returns a NEW string (the original discarded it),
    # and replace("", "india") never produced a usable default.  Default a
    # blank answer to pan-India explicitly instead.
    if not job_location:
        job_location = "india"

    # Open an automated Chrome window on the results for the inputs above.
    driver = webdriver.Chrome("K:\\chromedriver.exe")
    driver.get(search_url(job_field, job_location))
    time.sleep(5)

    # LinkedIn uses infinite scroll, so scroll to the bottom 7 times to load
    # the whole page (about 173 results) before scraping.
    for _ in range(7):
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
        time.sleep(5)

    # One iteration per job card.  The XPaths follow a per-card pattern, so
    # the card index is spliced in.  MISSING VALUES become "Not Specified";
    # a card with no title at all is skipped entirely.
    for i in range(1, 174):
        try:
            title_list.append(
                driver.find_element_by_xpath(job_xpath(i, "/div[1]/h3")).text)
        except NoSuchElementException:
            continue
        try:
            company_list.append(
                driver.find_element_by_xpath(job_xpath(i, "/div[1]/h4")).text)
        except NoSuchElementException:
            company_list.append("Not Specified")
        try:
            # Keep only the city name from the full location string.
            location_list.append(city_only(
                driver.find_element_by_xpath(job_xpath(i, "/div[1]/div/span")).text))
        except NoSuchElementException:
            location_list.append("Not Specified")
        try:
            time_of_posting_list.append(
                driver.find_element_by_xpath(job_xpath(i, "/div[1]/div/time")).text)
        except NoSuchElementException:
            time_of_posting_list.append("Not Specified")
        # No experience info on LinkedIn; placeholder keeps columns aligned.
        experince_required_list.append("Not Specified")

    # Combine all the lists above into one tabular result.
    return pd.DataFrame({"Job_Title": title_list,
                         "Company": company_list,
                         "Location": location_list,
                         "Time of Posting": time_of_posting_list,
                         "Experince_Required": experince_required_list})


if __name__ == "__main__":
    job_data = main()
    print(job_data)
In [ ]: 2. NAUKRI
# ------------------------------------------------------------------ NAUKRI ---
# Loop over the job cards already located by Selenium (`jobs`, defined in an
# earlier cell) and pull each field out of a card's HTML with BeautifulSoup.
# MISSING VALUES are recorded as "Not Specified".
#
# BUG FIX: BeautifulSoup's find() returns None when a tag is absent, so the
# following `.text`/`.find()` raises AttributeError -- NOT Selenium's
# NoSuchElementException.  The original only caught NoSuchElementException for
# the experience/salary/location fields, so a card missing one of those tags
# crashed the loop; AttributeError (and IndexError for the footer spans) is
# now caught as well.
for job in jobs:
    # Selenium hands the source of this job card over to BeautifulSoup.
    result = job.get_attribute('innerHTML')
    soup = BeautifulSoup(result, 'html.parser')

    try:
        # Fetch the title text by class name; skip cards without a title.
        title.append(soup.find("a", class_='title').text)
    except (NoSuchElementException, AttributeError):
        continue
    try:
        company.append(soup.find("a", class_='subTitle').text)
    except (NoSuchElementException, AttributeError):
        company.append('Not Specified')
    try:
        # <li> tag with the experience class; the value sits in its <span>.
        span_data = soup.find('li', {'class': 'fleft grey-text br2 placeHolderLi experience'})
        experience.append(span_data.find('span').text)
    except (NoSuchElementException, AttributeError):
        experience.append('Not Specified')
    try:
        # <li> tag with the salary class; the value sits in its <span>.
        span_data_sal = soup.find('li', {'class': 'fleft grey-text br2 placeHolderLi salary'})
        salary.append(span_data_sal.find('span').text)
    except (NoSuchElementException, AttributeError):
        salary.append('Not Specified')
    try:
        # <li> tag with the location class; the value sits in its <span>.
        span_data_loc = soup.find('li', {'class': 'fleft grey-text br2 placeHolderLi location'})
        location.append(span_data_loc.find('span').text)
    except (NoSuchElementException, AttributeError):
        location.append('Not Specified')
    try:
        # Footer spans selected hierarchically: the posting date is usually
        # the second `.type` entry, but some cards only carry one.
        job_post = soup.select('.jobTupleFooter .type')
        try:
            job_posted.append(job_post[1].find('span').text)
        except (IndexError, AttributeError):
            job_posted.append(job_post[0].find('span').text)
    except (NoSuchElementException, IndexError, AttributeError):
        job_posted.append('Not Specified')

# Combine all the individual lists into tabular data.
naukri_job_data = pd.DataFrame({"Job_Title": title, "Company": company,
                                "Location": location, "Job_Experience": experience,
                                "Salary": salary, "Job_posted": job_posted})
In [ ]: 3. MONSTER
# ----------------------------------------------------------------- MONSTER ---
# Open an automated Chrome window on the Monster search results and scrape
# every job card on the page into the result lists.
driver = webdriver.Chrome("K:\\chromedriver.exe")

# Ask the user which field of jobs to search for; spaces are URL-encoded.
field = input("enter field:")
encoded_field = field.replace(" ", "%20")
url = "https://www.monsterindia.com/srp/results?start=25&sort=1&limit=25&query=" + encoded_field + "&searchId=f780cd3a-baf2-402b-9d0f-a390efa69eda"
driver.get(url)
time.sleep(5)

# (Disabled) code that derived the total number of result pages so that data
# could be scraped from all of them:
#   page_no_text = driver.find_element_by_xpath(
#       "//*[@id='root']/div[4]/div[2]/section[2]/div[1]/span").text
#   print(page_no_text)
#   page_extract = page_no_text.split()
#   total_result = int(page_extract[-1])
#   total_pages = round(total_result / 20)

# Card indices scraped from the page -- presumably the gaps skip non-job
# tiles; TODO confirm against the live page layout.
card_numbers = [1, 2, 3, 4, 6, 7, 9, 10, 12, 13, 14, 15, 17, 18, 19, 20,
                22, 25, 26, 28, 29, 30, 31, 33]

# Scrape every card; any MISSING VALUE is stored as "Not Specified".
for idx in card_numbers:
    # All fields of one card share this XPath prefix.
    row = '//*[@id="srp-right-part"]/div/div[1]/div[1]/div[2]/div/div[' + str(idx) + ']/div'
    try:
        # Title of the result, extracted through its XPath; a card without a
        # title is skipped entirely.
        Job_Title.append(driver.find_element_by_xpath(row + '/div[1]/div/div/h3/a').text)
    except NoSuchElementException:
        continue
    try:
        Job_Company.append(driver.find_element_by_xpath(row + '/div[1]/div/div/span/a').text)
    except NoSuchElementException:
        Job_Company.append("Not Specified")
    try:
        Location.append(driver.find_element_by_xpath(row + '/div[1]/div/div/div/div[1]/span/small').text)
    except NoSuchElementException:
        Location.append("Not Specified")
    try:
        Experience_Required.append(driver.find_element_by_xpath(row + '/div[1]/div/div/div/div[2]/span/small').text)
    except NoSuchElementException:
        Experience_Required.append("Not Specified")
    try:
        posted = driver.find_element_by_xpath(row + '/div[2]/div[1]/span').text
        if posted == "":
            # Some cards keep the posting date in a second span.
            posted = driver.find_element_by_xpath(row + "/div[2]/div[1]/span[2]").text
        Job_Posting_date.append(posted)
    except NoSuchElementException:
        Job_Posting_date.append("Not Specified")
In [ ]: 4. INDEED
In [ ]: import csv
import pandas as pd
import time
import requests
import bs4
from bs4 import BeautifulSoup
# ------------------------------------------------------------------ INDEED ---
# Scrape the first 7 Indeed result pages for "data scientist" jobs with
# requests + BeautifulSoup.
#
# BUG FIX: the original parsed `page` before any page had been fetched (a
# NameError), the page loop never re-parsed the newly fetched HTML, and the
# final print showed the LinkedIn DataFrame.  Fetching and parsing now happen
# together inside the loop, and this section prints its own DataFrame.

l1 = []            # company names
l2 = []            # job titles
l3 = []            # raw location strings
l4 = []            # unused; kept so other cells referencing it still work
l5 = []            # salary strings (collected, not tabulated)
l6 = []            # posting dates
location_list = [] # city-only locations


def collect_page(html):
    """Parse one result page's HTML and append every job field to the lists."""
    soup = BeautifulSoup(html, "html.parser")
    # Tag containing the company name.
    for tag in soup.findAll("span", {"class": "company"}):
        l1.append(tag.text)
    # Tag containing the designation/title.
    for tag in soup.findAll("a", {"class": "jobtitle turnstileLink"}):
        l2.append(tag.text)
    # Tag containing the job location.
    for tag in soup.select(".location"):
        l3.append(tag.text)
    # Tag containing the salary offered (absent on many cards).
    for tag in soup.findAll("span", {"class": "salaryText"}):
        l5.append(tag.text)
    # Tag containing when the opening was posted.
    for tag in soup.findAll("span", {"class": "date"}):
        l6.append(tag.text)


def cities(raw_locations):
    """Reduce each "City, State" string to just the city name."""
    return [loc.split(",")[0] for loc in raw_locations]


def scrape_indeed():
    """Fetch result pages 1-7, parse them, and return the combined DataFrame."""
    for k in range(1, 8):
        url = "https://www.indeed.co.in/jobs?q=data+scientist&l=" + str(k)
        page = requests.get(url)
        time.sleep(5)  # be polite between requests
        collect_page(page.content)

    location_list.extend(cities(l3))

    # Not every card carries every field, so trim all columns to the shortest
    # list to keep the DataFrame consistent.
    n = min(len(l1), len(l2), len(location_list), len(l6))
    return pd.DataFrame({"Job_Title": l2[:n], "Company": l1[:n],
                         "Location": location_list[:n],
                         "Time of Posting": l6[:n]})


if __name__ == "__main__":
    indeed_job_data = scrape_indeed()
    print(indeed_job_data)
In [ ]: 5. GLASSDOOR
# --------------------------------------------------------------- GLASSDOOR ---
# Lists for the desired parameters scraped from the Glassdoor results page.
Company_Name = []
Job_Position = []
Location = []
Time = []
# Not filled here; declared so the column exists alongside the other sites.
Experience_Required = []

# One iteration per result card on page 1 (30 cards).
#
# FIX: the original had no missing-value handling, so a single absent tag
# raised NoSuchElementException and killed the whole loop.  Missing values
# are now recorded as "Not Specified", consistent with the other sites; a
# card without a job position is skipped entirely (keeps lists aligned).
for i in range(1, 31):
    # All fields of one card share this XPath prefix.
    card = "//*[@id='MainCol']/div[1]/ul/li[" + str(i) + "]/div[2]"
    try:
        Job_Position.append(driver.find_element_by_xpath(card + "/a/span").text)
    except NoSuchElementException:
        continue
    try:
        Company_Name.append(driver.find_element_by_xpath(card + "/div[1]/a/span").text)
    except NoSuchElementException:
        Company_Name.append("Not Specified")
    try:
        Time.append(driver.find_element_by_xpath(card + "/div[2]/div/div[2]").text)
    except NoSuchElementException:
        Time.append("Not Specified")
    try:
        Location.append(driver.find_element_by_xpath(card + "/div[2]/span").text)
    except NoSuchElementException:
        Location.append("Not Specified")