Download as pdf or txt
Download as pdf or txt
You are on page 1of 18

1.

Intro: reading data and plotting

January 22, 2021

[3]: import pandas as pd

[6]: import numpy as np

[7]: import matplotlib.pyplot as plt

[185]: import matplotlib as mpl

[ ]:

[188]: mpl.style.use('ggplot')

[8]: # Load the vehicle CSV file into the DataFrame `df` used by the rest of the notebook.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell only
# runs where that exact file exists — consider a path relative to the notebook.
df=pd.read_csv('C:/Users/Nazakat ali/Desktop/Stat711/New folder/vehicle.csv')

[10]: df

[10]: Vehicle fm Mileage lh lc mc State


0 1 0.0 863.0 1.1 66.30 697.23 MS
1 2 10.0 4644.0 2.4 233.03 119.66 CA
2 3 15.0 16330.0 4.2 325.08 175.46 WI
3 4 0.0 13.0 1.0 66.64 0.00 OR
4 5 13.0 22537.0 4.5 328.66 175.46 AZ
… … … … … … … …
1619 1620 11.0 15565.0 33.9 3234.41 2046.03 HI
1620 1621 0.0 2.0 2.1 185.08 0.00 CA
1621 1622 14.0 17195.0 4.5 318.10 371.59 IL
1622 1623 13.0 28125.0 3.5 306.53 152.05 NJ
1623 1624 23.0 33011.0 1.8 129.04 119.66 PA

[1624 rows x 7 columns]

[15]: print(df.head())

Vehicle fm Mileage lh lc mc State


0 1 0.0 863.0 1.1 66.30 697.23 MS
1 2 10.0 4644.0 2.4 233.03 119.66 CA
2 3 15.0 16330.0 4.2 325.08 175.46 WI
3 4 0.0 13.0 1.0 66.64 0.00 OR
4 5 13.0 22537.0 4.5 328.66 175.46 AZ

1
[16]: print(df.shape)

(1624, 7)

[17]: print(df.columns)

Index(['Vehicle', 'fm', 'Mileage', 'lh', 'lc', 'mc', 'State'], dtype='object')

[18]: print(df.dtypes)

Vehicle int64
fm float64
Mileage float64
lh float64
lc float64
mc float64
State object
dtype: object

[19]: print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1624 entries, 0 to 1623
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Vehicle 1624 non-null int64
1 fm 1624 non-null float64
2 Mileage 1624 non-null float64
3 lh 1624 non-null float64
4 lc 1624 non-null float64
5 mc 1624 non-null float64
6 State 1624 non-null object
dtypes: float64(5), int64(1), object(1)
memory usage: 88.9+ KB
None

[24]: Mileage_df=df['Mileage']
print(Mileage_df)

0 863.0
1 4644.0
2 16330.0
3 13.0
4 22537.0

1619 15565.0
1620 2.0
1621 17195.0
1622 28125.0

2
1623 33011.0
Name: Mileage, Length: 1624, dtype: float64

[25]: print(Mileage_df.head())

0 863.0
1 4644.0
2 16330.0
3 13.0
4 22537.0
Name: Mileage, dtype: float64

[26]: print(Mileage_df.tail())

1619 15565.0
1620 2.0
1621 17195.0
1622 28125.0
1623 33011.0
Name: Mileage, dtype: float64

[29]: subst_df=df[['Vehicle', 'fm', 'State']]


print(subst_df)

Vehicle fm State
0 1 0.0 MS
1 2 10.0 CA
2 3 15.0 WI
3 4 0.0 OR
4 5 13.0 AZ
… … … …
1619 1620 11.0 HI
1620 1621 0.0 CA
1621 1622 14.0 IL
1622 1623 13.0 NJ
1623 1624 23.0 PA

[1624 rows x 3 columns]

[ ]:

[31]: subst_df.loc[1]

[31]: Vehicle 2
fm 10
State CA
Name: 1, dtype: object

[34]: print(subst_df.head(n=5))

3
Vehicle fm State
0 1 0.0 MS
1 2 10.0 CA
2 3 15.0 WI
3 4 0.0 OR
4 5 13.0 AZ

[38]: print(subst_df.loc[[0,3,4,7,8]])

Vehicle fm State
0 1 0.0 MS
3 4 0.0 OR
4 5 13.0 AZ
7 8 5.0 GA
8 9 8.0 WA

[39]: print(subst_df.iloc[8])

Vehicle 9
fm 8
State WA
Name: 8, dtype: object

[40]: # Last row


print(subst_df.iloc[-1])

Vehicle 1624
fm 23
State PA
Name: 1623, dtype: object

[41]: # Select the rows at integer positions 0, 3, 4, 7 and 8


print(subst_df.iloc[[0,3,4,7,8]])

Vehicle fm State
0 1 0.0 MS
3 4 0.0 OR
4 5 13.0 AZ
7 8 5.0 GA
8 9 8.0 WA

[ ]: subst_df_loc=subst_df.loc[:,['Vehicle', 'fm']]
print(subst_df_loc)

[44]: # iloc selects rows by integer position; negative positions count from the end


# -2 is the second-to-last row
print(subst_df.iloc[[0,3,4,7,-2]])

Vehicle fm State
0 1 0.0 MS

4
3 4 0.0 OR
4 5 13.0 AZ
7 8 5.0 GA
1622 1623 13.0 NJ

[11]: df.groupby('Vehicle')['fm'].mean()

[11]: Vehicle
1 0.0
2 10.0
3 15.0
4 0.0
5 13.0

1620 11.0
1621 0.0
1622 14.0
1623 13.0
1624 23.0
Name: fm, Length: 1624, dtype: float64

[36]: multi_group_df=df.\
groupby(['Vehicle', 'State'])\
[['lc', 'mc', 'lh', 'Mileage']].\
mean()

[37]: multi_group_df

[37]: lc mc lh Mileage
Vehicle State
1 MS 66.30 697.23 1.1 863.0
2 CA 233.03 119.66 2.4 4644.0
3 WI 325.08 175.46 4.2 16330.0
4 OR 66.64 0.00 1.0 13.0
5 AZ 328.66 175.46 4.5 22537.0
… … … … …
1620 HI 3234.41 2046.03 33.9 15565.0
1621 CA 185.08 0.00 2.1 2.0
1622 IL 318.10 371.59 4.5 17195.0
1623 NJ 306.53 152.05 3.5 28125.0
1624 PA 129.04 119.66 1.8 33011.0

[1624 rows x 4 columns]

[41]: df_uniq=df.groupby('State')['lc'].nunique()
df_uniq

5
[41]: State
* 1
AK 4
AL 26
AR 9
AZ 57
CA 183
CO 35
CT 12
DE 1
FL 160
GA 69
HI 4
IA 11
ID 17
IL 15
IN 15
KS 14
KY 14
LA 48
MA 21
MD 12
ME 4
MI 36
MN 20
MO 30
MS 12
MT 10
NC 44
ND 2
NE 8
NH 5
NJ 21
NM 12
NV 18
NY 30
OH 28
OK 32
OR 28
PA 40
RI 2
SC 30
SD 11
TN 29
TX 279
UT 20
VA 30

6
VT 3
WA 22
WI 13
WV 3
WY 4
Name: lc, dtype: int64

[42]: df_uniq.plot()

[42]: <AxesSubplot:xlabel='State'>

[46]: import matplotlib.pyplot as plt

[56]: plt.plot(df['lc'])

[56]: [<matplotlib.lines.Line2D at 0x268816a8af0>]

7
[77]: df_uniq.plot(kind='box')
plt.show()

8
[83]: ubst_df_1=df[['Vehicle', 'fm', 'lc']]
ubst_df_1

[83]: Vehicle fm lc
0 1 0.0 66.30
1 2 10.0 233.03
2 3 15.0 325.08
3 4 0.0 66.64
4 5 13.0 328.66
… … … …
1619 1620 11.0 3234.41
1620 1621 0.0 185.08
1621 1622 14.0 318.10
1622 1623 13.0 306.53
1623 1624 23.0 129.04

[1624 rows x 3 columns]

[106]: ubst_df_1.plot(kind='box', figsize=(10,7), color='red', vert=True)


plt.title('Box plot deffrerr')

[106]: Text(0.5, 1.0, 'Box plot deffrerr')

9
[116]: fig=plt.figure() # creat figure
# add plot in figure
ax1=fig.add_subplot(2,2,1) # add subplot 1 (1 row, 2 coloumns, first plot)
ax2=fig.add_subplot(2,2,2) # add subplot 2 (1 row, 2 coloumns, second plot)
# subplot first 1
df.plot(kind='box', figsize=(10,7), color='red', vert=True, ax=ax1)
# 2 plot line
df.plot(kind='line', figsize=(20,7), ax=ax2)

[116]: <AxesSubplot:>

[119]: ubst_df_1.plot(kind='hist',figsize=(8,5))

[119]: <AxesSubplot:ylabel='Frequency'>

[123]: stat=df[['State']]
stat

10
[123]: State
0 MS
1 CA
2 WI
3 OR
4 AZ
… …
1619 HI
1620 CA
1621 IL
1622 NJ
1623 PA

[1624 rows x 1 columns]

[128]: df_uniq.plot(kind='box',figsize=(8,5))

[128]: <AxesSubplot:>

[129]: df_uniq1=df.groupby('lc')['State'].nunique()
df_uniq1

[129]: lc
0.00 19
9.45 1

11
10.57 1
11.00 1
11.84 1
..
1711.36 1
1903.43 1
2263.12 1
2652.37 1
3234.41 1
Name: State, Length: 1511, dtype: int64

[152]: subst_df_loc1=df[['State','lc']]
subst_df_loc1

[152]: State lc
0 MS 66.30
1 CA 233.03
2 WI 325.08
3 OR 66.64
4 AZ 328.66
… … …
1619 HI 3234.41
1620 CA 185.08
1621 IL 318.10
1622 NJ 306.53
1623 PA 129.04

[1624 rows x 2 columns]

[161]: subst_df_loc1.plot(kind='hist',figsize=(20,5))

[161]: <AxesSubplot:ylabel='Frequency'>

[168]: df3=df[['Vehicle','fm', 'lc', 'Mileage']]


df3

12
[168]: Vehicle fm lc Mileage
0 1 0.0 66.30 863.0
1 2 10.0 233.03 4644.0
2 3 15.0 325.08 16330.0
3 4 0.0 66.64 13.0
4 5 13.0 328.66 22537.0
… … … … …
1619 1620 11.0 3234.41 15565.0
1620 1621 0.0 185.08 2.0
1621 1622 14.0 318.10 17195.0
1622 1623 13.0 306.53 28125.0
1623 1624 23.0 129.04 33011.0

[1624 rows x 4 columns]

[189]: df3.plot(kind='hist',
figsize=(10,6),
alpha=0.6,
color=['g','r','b','y'])
plt.title('Histgram df table')
plt.ylabel('Output')
plt.xlabel('Components')

[189]: Text(0.5, 0, 'Components')

13
[191]: 'Group States by other variable '

[191]: 'Group States by other variable '

[190]: stat1=df.groupby('State', axis=0).sum()


stat1

[190]: Vehicle fm Mileage lh lc mc


State
* 1348 6.0 59525.0 0.9 37.17 114.53
AK 2177 26.0 65424.0 7.1 642.77 534.44
AL 23529 215.0 554370.0 118.5 7400.03 6483.77
AR 7506 60.0 104540.0 29.7 1731.98 2135.88
AZ 50349 697.0 1344387.0 253.3 19224.73 11813.38
CA 162456 1968.0 3522401.0 668.6 57020.03 35215.61
CO 21676 295.0 589308.0 103.4 8298.98 5920.27
CT 12255 60.0 124794.0 32.5 2471.53 2779.75
DE 968 2.0 4364.0 0.7 43.82 119.66
FL 146694 1666.0 3721543.0 657.9 46282.40 29782.09
GA 64808 621.0 1633175.0 216.9 15170.25 15533.79
HI 4065 46.0 54564.0 42.8 4038.15 2556.04
IA 9895 89.0 133925.0 43.4 2746.91 2673.04
ID 11133 176.0 323429.0 46.1 3276.81 2306.95
IL 12329 192.0 336161.0 36.7 2816.61 2645.83
IN 10504 178.0 441549.0 39.1 2749.39 1872.99
KS 13637 123.0 316863.0 25.3 1757.24 1860.16
KY 12530 117.0 260280.0 36.1 2400.31 2267.71
LA 40523 432.0 1073698.0 113.9 7880.17 6716.38
MA 15565 194.0 324436.0 43.7 3604.91 2443.03
MD 9993 97.0 173615.0 25.7 1995.79 1352.39
ME 4405 45.0 86086.0 10.5 702.87 1351.85
MI 24984 305.0 670523.0 113.6 8019.60 7692.48
MN 10404 167.0 346951.0 55.5 4051.66 3196.98
MO 20314 297.0 742507.0 92.5 6913.26 5529.12
MS 10634 135.0 349825.0 37.5 2422.65 3070.48
MT 9218 107.0 217460.0 25.0 1622.74 1318.94
NC 45532 526.0 1099295.0 151.5 10296.83 8990.30
ND 919 14.0 37525.0 5.6 373.57 295.12
NE 5247 80.0 222420.0 26.0 1450.10 1558.51
NH 2500 35.0 66901.0 10.0 628.84 151.43
NJ 22540 165.0 264992.0 79.7 6955.37 7355.30
NM 6850 86.0 197359.0 58.3 4104.38 2063.58
NV 19054 182.0 273121.0 84.6 6880.48 4126.82
NY 27477 202.0 311090.0 101.2 7840.71 4689.93
OH 13424 264.0 670864.0 88.8 6034.52 5266.70
OK 34134 273.0 769424.0 96.6 6602.07 5118.16
OR 16460 228.0 418781.0 74.5 5452.90 3950.32

14
PA 45137 355.0 763958.0 109.9 7203.17 7549.29
RI 2858 8.0 14253.0 4.2 326.34 295.92
SC 21617 247.0 592642.0 108.8 7439.07 7082.80
SD 10445 81.0 176511.0 21.5 1437.62 1619.12
TN 22394 284.0 789815.0 95.1 6243.95 5648.47
TX 225444 3002.0 7381131.0 1094.9 79140.87 51613.97
UT 16024 207.0 361139.0 68.7 5329.92 4105.56
VA 28526 323.0 598801.0 74.6 5437.00 4512.68
VT 3525 18.0 22328.0 10.7 624.40 358.98
WA 17264 193.0 380889.0 73.1 5610.80 2894.74
WI 11125 115.0 234319.0 42.7 2863.82 1997.52
WV 3409 23.0 61550.0 5.2 277.88 239.32
WY 3696 62.0 134949.0 8.8 621.48 565.12

[200]: stat1.plot(kind='hist',
figsize=(10,6),
alpha=0.6,
color=['g','r','b','y'])
plt.title('Histgram df table')
plt.ylabel('Output')
plt.xlabel('State')

[200]: Text(0.5, 0, 'State')

15
[221]: # Pie plote
stat1['mc'].plot(kind='pie',
figsize=(15,6),
autopct='%1.0f%%',
startangle=150,
shadow=True,
labels=None,
pctdistance=1.12,)
plt.legend(labels=stat1.index, loc='upper left')

[221]: <matplotlib.legend.Legend at 0x268c11cdac0>

16
17
[232]: stat1.plot(kind='bar', figsize=(20,10), rot=60)
plt.plot('State')
plt.annotate('high',
xy=(1,100000),
)

[232]: Text(1, 100000, 'high')

[ ]:

18

You might also like