Download as pdf or txt
Download as pdf or txt
You are on page 1 of 4

In [14]: import pandas as pd

# Source table for the whole analysis; the path is relative to the notebook.
csv_path = 'extracted_table_cleaned202403201133.csv'
data = pd.read_csv(csv_path)

In [15]: import gender_guesser.detector as gender


def get_gender(data, manual=None, detector=None):
    """Add a ``gender`` column to ``data``, guessed from each row's ``Name``.

    The first name is taken as the first space-separated token of whatever
    follows the last comma in ``Name`` (presumably names are stored as
    "Last, First Middle" -- confirm against the CSV). If there is no comma,
    the first token of the whole string is used.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Name`` column. The frame is modified in place.
    manual : dict, optional
        Overrides keyed by parsed first name. When a parsed name is present
        here, its value is used as-is and the detector is not consulted.
    detector : object, optional
        Anything exposing ``get_gender(first_name) -> str``. Defaults to a
        fresh ``gender.Detector()``; pass your own to change its settings.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, now with a ``gender`` column.

    Notes
    -----
    ``mostly_male`` / ``mostly_female`` are collapsed to ``male`` /
    ``female``. Any other label the detector returns is stored unchanged and
    reported with a printed warning so it can be added to ``manual``.
    """
    if detector is None:
        detector = gender.Detector()
    overrides = manual or {}

    def parse_name(name):
        # Missing names arrive as NaN (a float) from read_csv and cannot be
        # split; hand the detector an empty string instead of crashing.
        if not isinstance(name, str):
            return ''
        return name.split(',')[-1].strip().split(' ')[0]

    collapse = {'mostly_male': 'male', 'mostly_female': 'female'}

    # Walk the column by position rather than by index label, so the result
    # lines up with the rows even when the frame has a non-default index
    # (e.g. after filtering or sorting).
    cleaned_results = []
    for raw_name in data['Name']:
        name = parse_name(raw_name)
        if name in overrides:
            cleaned_results.append(overrides[name])
            continue

        result = detector.get_gender(name)
        result = collapse.get(result, result)
        if result not in ('male', 'female'):
            print(f"warning, {name} is {result}")
        cleaned_results.append(result)

    data['gender'] = cleaned_results
    return data

# In [16]:
# Hand-written overrides: parsed first name -> gender label. get_gender
# consults these before falling back to the detector's guess.
manual = dict(example='female')
data = get_gender(data, manual)

In [17]: import matplotlib.pyplot as plt


# Salary distribution across every row, in 30 bins. The axis label is set on
# the Axes that pandas returns instead of going through pyplot's current axes.
ax_all = data['Salary'].plot.hist(bins=30, title='Histogram of All Salaries')
ax_all.set_xlabel('$ Salary (USD)')

Out[17]: Text(0.5, 0, '$ Salary (USD)')


# In [18]:
# Summary statistics (count, mean, std, quartiles, extremes) for all salaries.
salary_all = data['Salary']
salary_all.describe()

Out[18]: count 66.000000


mean 60091.530303
std 30427.507930
min 33238.000000
25% 44059.000000
50% 49841.000000
75% 69758.000000
max 217782.000000
Name: Salary, dtype: float64

# In [19]:
# Rows whose gender column is exactly 'male', then their salary distribution.
is_male = data['gender'] == 'male'
data_m = data[is_male]

ax_m = data_m['Salary'].plot.hist(bins=30, title='Histogram of Male Salaries')
ax_m.set_xlabel('$ Salary (USD)')

Out[19]: Text(0.5, 0, '$ Salary (USD)')


In [20]: data_m.describe()  # summary statistics for the rows labelled 'male'

Out[20]: Salary

count 28.000000

mean 67355.035714

std 42287.350621

min 39562.000000

25% 44059.000000

50% 56551.000000

75% 71848.000000

max 217782.000000

# In [21]:
# Rows whose gender column is exactly 'female', then their salary distribution.
is_female = data['gender'] == 'female'
data_f = data[is_female]

ax_f = data_f['Salary'].plot.hist(bins=30, title='Histogram of Female Salaries')
ax_f.set_xlabel('$ Salary (USD)')

Out[21]: Text(0.5, 0, '$ Salary (USD)')


In [22]: data_f.describe()  # summary statistics for the rows labelled 'female'

Out[22]: Salary

count 38.000000

mean 54739.473684

std 15880.882804

min 33238.000000

25% 42926.500000

50% 49841.000000

75% 64508.000000

max 97562.000000

# In [23]:
# Gap between the mean male and mean female salary, as a percentage of the
# female mean. The original line was cut off mid-expression in the export;
# this form reproduces the printed result (~23.05%) from the two means shown
# in the describe() outputs: (67355.04 - 54739.47) / 54739.47 * 100.
mean_salary_m = data_m['Salary'].mean()
mean_salary_f = data_f['Salary'].mean()
percent_gender_disparity = ((mean_salary_m - mean_salary_f) / mean_salary_f) * 100

In [24]: print(f'Calculated Gender disparity: {percent_gender_disparity}%')  # prints the unrounded float

Calculated Gender disparity: 23.046553393724196%

In [ ]: !jupyter nbconvert --to html opradata20240320.ipynb

In [ ]:

You might also like