
In [ ]: 1. LINKEDIN

In [ ]: import time
import requests
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import bs4
from bs4 import BeautifulSoup

title_list=[]
company_list=[]
location_list=[]
location_list_temp=[]
location_list_city=[]
time_of_posting_list=[]
experince_required_list=[]

#This user input is the field in which the user wants to search for a job
job_field=input("Enter the field in which a job is required: ")
job_field=job_field.replace(" ","%20")

#This user input is the user's preferred job location
job_location=input("Enter the preferred job location (you can enter a specific state or city, or leave blank for pan-India jobs): ")
if job_location.strip()=="":
    job_location="india"

#This part of the code opens an automated Chrome window and gets the url based on the user's inputs given above
driver = webdriver.Chrome("K:\\chromedriver.exe")
url="https://www.linkedin.com/jobs/search/?geoId=102713980&keywords="+job_field+"&location="+job_location+"&start=25"
driver.get(url)
time.sleep(5)

'''
This for loop automates scrolling on the LinkedIn job results page. Because LinkedIn uses an infinite
scroll system, we have to scroll down 7 times to load the data of the whole page.
'''
for i in range(1,8):
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
    time.sleep(5)
page=requests.get(url)

'''
This for loop gets the data of each job posted on the page.
There are 173 job results available on a single LinkedIn page, so this for loop iterates 173 times.
'''

for i in range(1,174):

    '''
    This code finds the title of the job through its XPath. As we observed a pattern in the XPath, we
    tweaked it and iterated it with the variable i so that the titles of all the jobs could be obtained
    and appended to a list.
    Below the title, similar logic is applied for company, location and time of posting.
    '''
    #This try/except code is used for MISSING VALUES; if there are any, they get recorded as "Not Specified"
    try:
        title=driver.find_element_by_xpath("//*[@id='main-content']/div/section/ul/li["+str(i)+"]/div[1]/h3").text
        title_list.append(title)
    except NoSuchElementException:
        continue

    try:
        company=driver.find_element_by_xpath("//*[@id='main-content']/div/section/ul/li["+str(i)+"]/div[1]/h4").text
        company_list.append(company)
    except NoSuchElementException:
        company_list.append("Not Specified")

    try:
        location=driver.find_element_by_xpath("//*[@id='main-content']/div/section/ul/li["+str(i)+"]/div[1]/div/span").text
        location_list_temp.append(location)
    except NoSuchElementException:
        location_list_temp.append("Not Specified")

    try:
        time_of_posting=driver.find_element_by_xpath("//*[@id='main-content']/div/section/ul/li["+str(i)+"]/div[1]/div/time").text
        time_of_posting_list.append(time_of_posting)
    except NoSuchElementException:
        time_of_posting_list.append("Not Specified")

    '''
    As no required experience is mentioned on the LinkedIn site, we append "Not Specified" for
    the experience-required column, since the other sites do mention it.
    '''
    experince_required_list.append("Not Specified")

#to get just the city name from the whole string
location_list_city=[word.split(',')[0] for word in location_list_temp]
for city in location_list_city:
    location_list.append(city)

#This code uses the pandas library and makes a dataframe from all the lists obtained above

job_data = pd.DataFrame({"Job_Title":title_list,"Company":company_list,"Location":location_list,"Time of Posting":time_of_posting_list,"Experince_Required":experince_required_list})
print(job_data)

#Convert data frame into csv


#job_data.to_csv("Linkedin_Jobs_DataScience.csv")
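Note: webdriver.Chrome("K:\\chromedriver.exe") and the find_element_by_xpath(...) helpers used throughout these scripts belong to the Selenium 3 API. Selenium 4 removed them, so on a newer install the same lookups would look roughly like the sketch below (the driver path and XPath are simply the values already used above; treat this as an untested equivalent, not part of the original notebook):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

#Selenium 4 style: the driver path is wrapped in a Service object
driver = webdriver.Chrome(service=Service("K:\\chromedriver.exe"))
driver.get("https://www.linkedin.com/jobs/search/?geoId=102713980&keywords=data%20science&location=india&start=25")

#Selenium 4 style: find_element(By.XPATH, ...) replaces find_element_by_xpath(...)
title = driver.find_element(By.XPATH, "//*[@id='main-content']/div/section/ul/li[1]/div[1]/h3").text
print(title)
driver.quit()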

In [ ]: 2. NAUKRI

In [ ]: from selenium import webdriver


from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException
import pandas as pd

#defining the lists


title = []
company = []
experience = []
salary = []
location = []
job_posted = []

# Create instance of google chrome to open url in Chrome


driver = webdriver.Chrome("C:/Users/prachi/chromedriver")

for i in range(1,15): #for looping through the pages of the url

    driver.get("https://www.naukri.com/data-science-jobs-"+str(i)) #url of the naukri site for fetching data
    driver.implicitly_wait(4)
    jobs = driver.find_elements_by_class_name('jobTuple') #selenium driver finds the class name in the html code

    for job in jobs: #for looping through the job posts on the current page
        result = job.get_attribute('innerHTML')
        soup = BeautifulSoup(result, 'html.parser') #Selenium hands off the source of the specific job card to Beautiful Soup

        try:
            title.append(soup.find("a",class_='title').text) #find the class name in the html and fetch the text using soup
        except NoSuchElementException: #exception if the element is not found on the html page
            continue #go to the next iteration of the for loop
        except: #exception for AttributeError
            continue

        try:
            company.append(soup.find("a",class_='subTitle').text) #find the class name in the html and fetch the text using soup
        except NoSuchElementException:
            company.append('Not Specified')
        except:
            company.append('Not Specified')

        try:
            span_data = soup.find('li',{'class':'fleft grey-text br2 placeHolderLi experience'}) #find the li tag with the specified class name
            experience.append(span_data.find('span').text) #find the span tag inside the li tag and fetch the text
        except (NoSuchElementException, AttributeError): #AttributeError covers a missing tag returned as None
            experience.append('Not Specified')

        try:
            span_data_sal = soup.find('li',{'class':'fleft grey-text br2 placeHolderLi salary'}) #find the li tag with the specified class name
            salary.append(span_data_sal.find('span').text) #find the span tag inside the li tag and fetch the text
        except (NoSuchElementException, AttributeError):
            salary.append('Not Specified')

        try:
            span_data_loc = soup.find('li',{'class':'fleft grey-text br2 placeHolderLi location'}) #find the li tag with the specified class name
            location.append(span_data_loc.find('span').text) #find the span tag inside the li tag and fetch the text
        except (NoSuchElementException, AttributeError):
            location.append('Not Specified')

        try:
            job_post = soup.select('.jobTupleFooter .type') #select the elements whose class names are specified hierarchically
            try:
                job_posted.append(job_post[1].find('span').text) #get the text of the second span tag
            except:
                job_posted.append(job_post[0].find('span').text) #fallback when only one span exists in the specified class

        except (NoSuchElementException, IndexError): #IndexError covers the case where no matching element exists at all
            job_posted.append('Not Specified')

#create a dataframe by combining all the individual lists to get the tabular data
naukri_job_data = pd.DataFrame({"Job_Title":title, "Company":company, "Location":location, "Job_Experience":experience, "Salary":salary, "Job_posted":job_posted})

naukri_job_data.to_csv('Naukri.csv') #write the dataframe to csv file
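The missing-value handling above works because soup.find() returns None when a tag is absent, so calling .text on the result raises AttributeError rather than Selenium's NoSuchElementException. A small helper (hypothetical, not part of the original script) makes that pattern explicit and reusable:

from bs4 import BeautifulSoup

def text_or_default(parent, name, default="Not Specified", **attrs):
    #return the text of the first matching tag, or a default when the tag is missing
    tag = parent.find(name, **attrs) if parent is not None else None
    return tag.text.strip() if tag is not None else default

#stand-alone example on a snippet of job-card HTML
card = BeautifulSoup('<a class="title">Data Scientist</a>', 'html.parser')
print(text_or_default(card, "a", class_="title"))     #-> Data Scientist
print(text_or_default(card, "a", class_="subTitle"))  #-> Not Specified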

In [ ]: 3. MONSTER

In [ ]: from selenium import webdriver


from selenium.common.exceptions import NoSuchElementException
import time
import requests
import pandas as pd

'''Below are the 5 fields for which we're going to extract data.
We've created empty lists for each of them.'''
Job_Title=[]
Job_Company=[]
Experience_Required=[]
Location=[]
Job_Posting_date=[]

driver = webdriver.Chrome("K:\\chromedriver.exe")

#This code is to ask users about the field of jobs they want to search for
z=input("enter field:")
zzz=z.replace(" ","%20")

url="https://www.monsterindia.com/srp/results?start=25&sort=1&limit=25&query="+str(zzz)+"&searchId=f780
cd3a-baf2-402b-9d0f-a390efa69eda"
driver.get(url)
time.sleep(5)

#Code to find the number of pages so that data could be scraped from all the pages
'''page_no_text = driver.find_element_by_xpath("//*[@id='root']/div[4]/div[2]/section[2]/div[1]/span").text
print(page_no_text)
page_extract=page_no_text.split()
total_result = int(page_extract[-1])
total_pages = round(total_result/20)'''

'''The list below exists because some elements were deliberately removed
from the html code, so there is no definite pattern that lets us scrape the web page easily.'''

l1=[1,2,3,4,6,7,9,10,12,13,14,15,17,18,19,20,22,25,26,28,29,30,31,33]

#The for loop is to scrape data from all the pages

for k in range(25,200,25):
    url="https://www.monsterindia.com/srp/results?start="+str(k)+"&sort=1&limit=25&query="+str(zzz)+"&searchId=f780cd3a-baf2-402b-9d0f-a390efa69eda"
    driver.get(url)
    time.sleep(5)

    #The for loop is to scrape the data from all the results on the page
    for i in l1:
        #This try/except code is used for MISSING VALUES; if there are any, they get recorded as "Not Specified"
        try:
            #below we extract the title of the result by using its XPath
            title = driver.find_element_by_xpath('//*[@id="srp-right-part"]/div/div[1]/div[1]/div[2]/div/div['+str(i)+']/div/div[1]/div/div/h3/a').text
            #below we append it to the list we made earlier
            Job_Title.append(title)
        except NoSuchElementException:
            continue
        try:
            company = driver.find_element_by_xpath('//*[@id="srp-right-part"]/div/div[1]/div[1]/div[2]/div/div['+str(i)+']/div/div[1]/div/div/span/a').text
            Job_Company.append(company)
        except NoSuchElementException:
            Job_Company.append("Not Specified")
        try:
            location = driver.find_element_by_xpath('//*[@id="srp-right-part"]/div/div[1]/div[1]/div[2]/div/div['+str(i)+']/div/div[1]/div/div/div/div[1]/span/small').text
            Location.append(location)
        except NoSuchElementException:
            Location.append("Not Specified")
        try:
            experience = driver.find_element_by_xpath('//*[@id="srp-right-part"]/div/div[1]/div[1]/div[2]/div/div['+str(i)+']/div/div[1]/div/div/div/div[2]/span/small').text
            Experience_Required.append(experience)
        except NoSuchElementException:
            Experience_Required.append("Not Specified")
        try:
            jobposting = driver.find_element_by_xpath('//*[@id="srp-right-part"]/div/div[1]/div[1]/div[2]/div/div['+str(i)+']/div/div[2]/div[1]/span').text
            if jobposting=="":
                jobposting=driver.find_element_by_xpath("//*[@id='srp-right-part']/div/div[1]/div[1]/div[2]/div/div["+str(i)+"]/div/div[2]/div[1]/span[2]").text
            Job_Posting_date.append(jobposting)
        except NoSuchElementException:
            Job_Posting_date.append("Not Specified")

Job_Posting_date = [x.replace('Posted:', '') for x in Job_Posting_date]

#Below we form a dataframe of the 5 fields we collected.

#This code uses the pandas library and makes a dataframe from all the lists obtained above
job_data1 = pd.DataFrame({"Job_Title":Job_Title,"Company":Job_Company,"Location":Location,"Experience_Required":Experience_Required,"Job_Posting_Date":Job_Posting_date})
print(job_data1)

#to convert dataframe into csv


job_data1.to_csv("monster.csv")
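The commented-out block near the top of this cell shows how the number of result pages could be read from the results-count text instead of hard-coding range(25,200,25). A minimal sketch of that idea, assuming the count element yields text such as "Showing 1-25 of 512" (the format is an assumption carried over from the commented-out code, not something the original script confirms):

import math

result_text = "Showing 1-25 of 512"    #hypothetical value of page_no_text from the commented-out block
total_result = int(result_text.split()[-1])
results_per_page = 25                   #matches the limit=25 parameter in the URL above
total_pages = math.ceil(total_result / results_per_page)

#the paging loop could then cover every page instead of a fixed range
for k in range(25, (total_pages + 1) * results_per_page, results_per_page):
    print("would scrape the page starting at result", k)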

In [ ]: 4. INDEED

In [ ]: import csv
import pandas as pd
import time
import requests
import bs4
from bs4 import BeautifulSoup

l1=[]
l2=[]
l3=[]
l4=[]
l5=[]
l6=[]
location_list =[]

url = 'https://www.indeed.co.in/jobs?q=data+scientist&l='

time.sleep(5)

#retrieving the data from the url

page = requests.get(url)

#using Beautiful Soup, parse the html code to get the parse tree
soup = BeautifulSoup(page.content,"html.parser")

#finding the tag which contains the company name information and assigning it to a variable
company = soup.findAll("span", {"class":"company"})

#finding the tag which contains the job location information and assigning it to a variable
location = soup.select('.location')

#finding the tag which contains the offered salary information and assigning it to a variable
salary = soup.findAll('span',{"class":'salaryText'})

#finding the tag which contains the information about when the job opening was posted and assigning it to a variable
post_date = soup.findAll('span',{"class":'date'})

#finding the tag which contains the information regarding the designation and assigning it to a variable
title = soup.findAll('a',{"class":"jobtitle turnstileLink"})

#loop statements to iterate through all the job posts till page 8
for k in range(1,8):
    url = 'https://www.indeed.co.in/jobs?q=data+scientist&l='+str(k)
    page = requests.get(url)
    time.sleep(5)

# loops to iterate through all the jobs on a webpage and append the required information to the lists
for c in company:
    l1.append(c.text)
    c=c.findNext('a')

for t in title:
    l2.append(t.text)
    t=t.findNext('a')

try:
    for l in location:
        l3.append(l.text)
        try:
            l=l.findNext('span')
        except:
            l=l.findNext('div')
except AttributeError: #Beautiful Soup raises AttributeError when an element is missing
    l3.append('not specified')
except:
    l3.append('not specified')

for p in post_date:
    l6.append(p.text)
    p=p.findNext('span')

#job experience required is not mentioned on indeed.com, so set a default value
for i in range(1,106):
    l5.append("not specified")

#removing extra spaces and characters from the data

l1 = [x.replace('\n', '') for x in l1]
l1 = [x.replace(' ', '') for x in l1]
l4 = [x.replace('\n', '') for x in l4]
l2 = [x.replace('\n', '') for x in l2]

#to get just the city name from the whole string
location_list_city=[word.split(',')[0] for word in l3]
for city in location_list_city:
    location_list.append(city)

#combining the lists in a data frame

job_data = pd.DataFrame({"Designation":l2,"Company_Name":l1,"Location":location_list,"Experience_Required":l5,"Post_Date":l6})

print(job_data)

#converting the data frame to a csv file


#job_data.to_csv("indeed.csv")
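As written, the page loop above re-requests each results page but never re-parses the response, so only the first requests.get actually feeds the lists. The sketch below shows how per-page parsing could be folded into the loop; the start query parameter and the step of 10 results per page are assumptions about Indeed's URL scheme, not something taken from the original script:

import requests
from bs4 import BeautifulSoup

titles, companies = [], []
for start in range(0, 80, 10):  #assumed pagination parameter: 8 pages of 10 results each
    page = requests.get('https://www.indeed.co.in/jobs?q=data+scientist&l=&start=' + str(start))
    soup = BeautifulSoup(page.content, "html.parser")
    for tag in soup.findAll('a', {"class": "jobtitle turnstileLink"}):
        titles.append(tag.text.strip())
    for tag in soup.findAll("span", {"class": "company"}):
        companies.append(tag.text.strip())
print(len(titles), "titles scraped")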

In [ ]: 5. GLASSDOOR

In [ ]: #importing all the necessary libraries


from bs4 import BeautifulSoup
import requests
import selenium
import xlsxwriter
import time
import pandas
from selenium import webdriver as wb
#storing the location of the chrome web driver
driver=wb.Chrome(r'C:\Program Files (x86)\chromedriver.exe')
#calling the desired URL
driver.get('https://www.glassdoor.co.in/Job/data-science-jobs-SRCH_KO0,12.htm')
#giving the website time to load
time.sleep(10)
#print(driver.title)

#Creating the lists for the desired parameters that we are scraping from the website
Company_Name=[]
Job_Position=[]
Location=[]
Time=[]
Experience_Required=[]
#for loop to fetch the details of each job posting on page 1
for i in range(1,31):
    #fetch the name of the company
    company=driver.find_element_by_xpath("//*[@id='MainCol']/div[1]/ul/li["+str(i)+"]/div[2]/div[1]/a/span").text
    Company_Name.append(company)

    #fetch the job position
    position=driver.find_element_by_xpath("//*[@id='MainCol']/div[1]/ul/li["+str(i)+"]/div[2]/a/span").text
    Job_Position.append(position)

    #fetch the job location
    location=driver.find_element_by_xpath("//*[@id='MainCol']/div[1]/ul/li["+str(i)+"]/div[2]/div[2]/span").text
    Location.append(location)

    #fetch the posting time
    timepost=driver.find_element_by_xpath("//*[@id='MainCol']/div[1]/ul/li["+str(i)+"]/div[2]/div[2]/div/div[2]").text
    Time.append(timepost)

    #experience required is not specified on the site
    experience=" Not specified"
    Experience_Required.append(experience)

#logic to fetch the number of pages in the search results


"""
pages=driver.find_element_by_xpath("//*[@id='ResultsFooter']/div[1]").text
print(pages)
split=pages.split()
print(split)
"""
#logic for the website to change to the next page after scraping the current page
for a in range(2,5):
    url='https://www.glassdoor.co.in/Job/data-science-jobs-SRCH_KO0,12_IP'+str(a)+'.htm'
    driver.get(url)
    #for loop to scrape data from page 2 onwards
    for i in range(1,31):
        company=driver.find_element_by_xpath("//*[@id='MainCol']/div[1]/ul/li["+str(i)+"]/div[2]/div[1]/a/span").text
        Company_Name.append(company)

        position=driver.find_element_by_xpath("//*[@id='MainCol']/div[1]/ul/li["+str(i)+"]/div[2]/a/span").text
        Job_Position.append(position)

        location=driver.find_element_by_xpath("//*[@id='MainCol']/div[1]/ul/li["+str(i)+"]/div[2]/div[2]/span").text
        Location.append(location)

        timepost=driver.find_element_by_xpath("//*[@id='MainCol']/div[1]/ul/li["+str(i)+"]/div[2]/div[2]/div/div[2]").text
        Time.append(timepost)

        experience=" Not specified"
        Experience_Required.append(experience)

#command to close the browser tab after scraping the data


driver.quit()
Time=[x.replace('d',' days ago') for x in Time]
Time=[x.replace('h',' hours ago') for x in Time]

#creating a data frame to store the values we have scraped

dataframe=pandas.DataFrame({"Company":Company_Name,"Position":Job_Position,"Location":Location,"Post Time":Time,"Experience Required":Experience_Required})
print(dataframe)

#exporting the data to the CSV file


dataframe.to_csv("Job List.csv")
"""
#article=soup.findAll(article id='MainCol')
#print(article)
"""
