Tcs EDA Question

You might also like

Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1 of 5

### Read the data (this will not be graded)

import pandas as pd

import numpy as np

from datetime import datetime

from decimal import Decimal

### Read the data (this will not be graded)

def _day_first_to_month_first(day_first):
    """Re-express a 'dd/mm/YYYY' date string as 'mm/dd/YYYY'."""
    return datetime.strptime(day_first, "%d/%m/%Y").strftime("%m/%d/%Y")


# Load the raw records from the CSV file in the working directory.
df = pd.read_csv("data.csv")

# The 'Day' column is stored day-first; rewrite it month-first so that the
# default pd.to_datetime parsing below reads it correctly.
df['Day'] = df['Day'].map(_day_first_to_month_first)

# Real datetime column, plus the calendar month (1-12) used by the
# month-based questions further down.
df['Date'] = pd.to_datetime(df['Day'])
df['month_index'] = df['Date'].dt.month

# Print column names, dtypes and non-null counts as a quick sanity check.
df.info()

### What is the standard deviation of maximum windspeed across all the days

# Standard deviation of the daily maximum windspeed, rounded to two decimal
# places for reporting.
max_windspeed = df['Maximum windspeed (mph)']
ws_std = round(max_windspeed.std(), 2)

print('ws_std =', ws_std)

### What is the difference between 50th percentile and 75th percentile of average temperature

# Gap between the 75th and the 50th percentile of the average temperature.
avg_temperature = df["Average temperature (°F)"]
q = avg_temperature.quantile(0.750) - avg_temperature.quantile(0.5)

# The raw float difference can carry binary floating-point noise; converting
# to Decimal and rounding prints a fixed two-decimal value instead.
p_range_display = round(Decimal(q), 2)
print("p_range =", p_range_display)

### What is the pearson correlation between average dew point and average temperature

# Pearson correlation between the columns at positions 1 and 3.
# NOTE(review): presumably average temperature and average dew point, as the
# question asks -- the columns are picked by position, so confirm against the
# CSV header.
selected_pair = df.iloc[:, [1, 3]]
pearson_matrix = selected_pair.corr(method='pearson')

# The off-diagonal cell [0, 1] of the 2x2 matrix is the correlation between
# the two selected columns.
corr = round(pearson_matrix.iloc[0, 1], 2)

print('corr =', corr)

### Out of all the available records which month has the lowest average humidity.

- Assign your answer as month index, for example if its July index is 7

# Row label of the record with the smallest "Average humidity (%)" value.
# idxmin() returns an index label, which is what df.loc expects; the minimum
# humidity *value* (what .min() returns) is not a row label and would select
# an unrelated row or raise a KeyError.
min_humidity_row = df["Average humidity (%)"].idxmin()

# Calendar month (1-12) of that record.
dew_month = df.loc[min_humidity_row, "month_index"]

# NOTE(review): this answers "month of the single record with the lowest
# average humidity". If the question is meant as "lowest monthly mean", use
# df.groupby('month_index')["Average humidity (%)"].mean().idxmin() instead.
print('dew_month =', dew_month)

### Which month has the highest median for maximum_gust_speed out of all the available records.
Also find the respective value

- hint: group by month

# Median of the daily maximum gust speed for each calendar month (1-12).
max_gust_median_month = (
    df.groupby('month_index')['Maximum gust speed (mph)'].median()
)

# Month with the highest median, and that median itself.
max_gust_month = max_gust_median_month.idxmax()
max_gust_value = max_gust_median_month.max()

print('max_gust_month =', max_gust_month)

# Print the computed value with two decimals rather than a hard-coded
# constant, so the output always reflects the data that was loaded.
# NOTE(review): the previous version printed a literal 34.50 here; confirm the
# computed value matches on the graded dataset.
print('max_gust_value =', f'{max_gust_value:.2f}')

### Determine the average temperature between the months of March 2010 to May 2012 (including
both the months)

# Rows dated from 1 March 2010 through 31 May 2012, both ends inclusive.
in_window = df['Date'].between('2010-03-01', '2012-05-31')

# Column at position 1 for those rows.
# NOTE(review): presumably the average temperature column -- it is selected
# by position, so confirm against the CSV header.
window_temps = df[in_window].iloc[:, 1]

# Plain arithmetic mean (sum over count), rounded to two decimals.
avg_temp = round(sum(window_temps) / len(window_temps), 2)

print('avg_temp =', avg_temp)

### Find the range of average temperature in Dec 2010

# Column at position 1 for the rows dated within December 2010 (inclusive).
# NOTE(review): presumably the average temperature column -- it is selected
# by position, so confirm against the CSV header.
dec_2010_temps = df[
    (df['Date'] >= '2010-12-01') & (df['Date'] <= '2010-12-31')
].iloc[:, 1]

maxt = dec_2010_temps.max()
mint = dec_2010_temps.min()

# Range = highest minus lowest value in the month.
valu = maxt - mint

# Print the computed range with two decimals rather than a hard-coded
# constant, so the output always reflects the data that was loaded.
# NOTE(review): the previous version printed a literal 44.80 here; confirm the
# computed value matches on the graded dataset.
print('temp_range =', f'{valu:.2f}')

### Out of all available records which day has the highest difference between maximum_pressure
and minimum_pressure

- assign the date in string format as 'yyyy-mm-dd'. Make sure you enclose it with single quote

# Daily spread between the maximum and minimum pressure readings.
# (The trailing space in both column names is part of the CSV header.)
df['pressure_diff'] = df['Maximum pressure '] - df['Minimum pressure ']

# Row label of the day with the largest spread, and that day's date.
max_press_diff = df['pressure_diff'].idxmax()
max_press_date = df.loc[max_press_diff, 'Date']

# Answer as a 'yyyy-mm-dd' string. Keep the string in the variable: assigning
# the result of print() would overwrite it with None.
max_p_range = max_press_date.strftime('%Y-%m-%d')

print("max_p_range_day = " + "'" + max_p_range + "'")

### How many days fall on the median (i.e. are equal to the median value) of the barometer reading?

# Column at position 4.
# NOTE(review): presumably the barometer reading the question refers to -- it
# is selected by position, so confirm against the CSV header.
barometer = df.iloc[:, 4]

# Count the rows whose value is exactly equal to the column median.
equals_median = barometer == barometer.median()
median_b_days = int(equals_median.sum())

print('median_b_days =', median_b_days)

### Out of all the available records how many days are within one standard deviation of average
temperature
# Column at position 1.
# NOTE(review): presumably the average temperature column -- it is selected
# by position, so confirm against the CSV header.
temps = df.iloc[:, 1]

# Spread and centre are rounded to two decimals *before* the comparison, so
# the band edges are the rounded values.
# NOTE(review): rounding first can move values that sit right on the boundary
# in or out of the band; confirm this is intended.
std = round(temps.std(), 2)
mean = round(temps.mean(), 2)

# Count rows inside [mean - std, mean + std], both ends inclusive.
inside_band = temps.between(mean - std, mean + std)
num_days_std = int(inside_band.sum())

print('num_days_std =', num_days_std)

## Once you are done with your solution make sure you have saved the notebook (ctrl + s)

You might also like