Professional Documents
Culture Documents
Intro Read Data and Plotting
Intro Read Data and Plotting
[ ]:
[188]: mpl.style.use('ggplot')
[10]: df
[15]: print(df.head())
1
[16]: print(df.shape)
(1624, 7)
[17]: print(df.columns)
[18]: print(df.dtypes)
Vehicle int64
fm float64
Mileage float64
lh float64
lc float64
mc float64
State object
dtype: object
[19]: print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1624 entries, 0 to 1623
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Vehicle 1624 non-null int64
1 fm 1624 non-null float64
2 Mileage 1624 non-null float64
3 lh 1624 non-null float64
4 lc 1624 non-null float64
5 mc 1624 non-null float64
6 State 1624 non-null object
dtypes: float64(5), int64(1), object(1)
memory usage: 88.9+ KB
None
[24]: Mileage_df=df['Mileage']
print(Mileage_df)
0 863.0
1 4644.0
2 16330.0
3 13.0
4 22537.0
…
1619 15565.0
1620 2.0
1621 17195.0
1622 28125.0
2
1623 33011.0
Name: Mileage, Length: 1624, dtype: float64
[25]: print(Mileage_df.head())
0 863.0
1 4644.0
2 16330.0
3 13.0
4 22537.0
Name: Mileage, dtype: float64
[26]: print(Mileage_df.tail())
1619 15565.0
1620 2.0
1621 17195.0
1622 28125.0
1623 33011.0
Name: Mileage, dtype: float64
Vehicle fm State
0 1 0.0 MS
1 2 10.0 CA
2 3 15.0 WI
3 4 0.0 OR
4 5 13.0 AZ
… … … …
1619 1620 11.0 HI
1620 1621 0.0 CA
1621 1622 14.0 IL
1622 1623 13.0 NJ
1623 1624 23.0 PA
[ ]:
[31]: subst_df.loc[1]
[31]: Vehicle 2
fm 10
State CA
Name: 1, dtype: object
[34]: print(subst_df.head(n=5))
3
Vehicle fm State
0 1 0.0 MS
1 2 10.0 CA
2 3 15.0 WI
3 4 0.0 OR
4 5 13.0 AZ
[38]: print(subst_df.loc[[0,3,4,7,8]])
Vehicle fm State
0 1 0.0 MS
3 4 0.0 OR
4 5 13.0 AZ
7 8 5.0 GA
8 9 8.0 WA
[39]: print(subst_df.iloc[8])
Vehicle 9
fm 8
State WA
Name: 8, dtype: object
Vehicle 1624
fm 23
State PA
Name: 1623, dtype: object
Vehicle fm State
0 1 0.0 MS
3 4 0.0 OR
4 5 13.0 AZ
7 8 5.0 GA
8 9 8.0 WA
[ ]: subst_df_loc=subst_df.loc[:,['Vehicle', 'fm']]
print(subst_df_loc)
Vehicle fm State
0 1 0.0 MS
4
3 4 0.0 OR
4 5 13.0 AZ
7 8 5.0 GA
1622 1623 13.0 NJ
[11]: df.groupby('Vehicle')['fm'].mean()
[11]: Vehicle
1 0.0
2 10.0
3 15.0
4 0.0
5 13.0
…
1620 11.0
1621 0.0
1622 14.0
1623 13.0
1624 23.0
Name: fm, Length: 1624, dtype: float64
[36]: multi_group_df=df.\
groupby(['Vehicle', 'State'])\
[['lc', 'mc', 'lh', 'Mileage']].\
mean()
[37]: multi_group_df
[37]: lc mc lh Mileage
Vehicle State
1 MS 66.30 697.23 1.1 863.0
2 CA 233.03 119.66 2.4 4644.0
3 WI 325.08 175.46 4.2 16330.0
4 OR 66.64 0.00 1.0 13.0
5 AZ 328.66 175.46 4.5 22537.0
… … … … …
1620 HI 3234.41 2046.03 33.9 15565.0
1621 CA 185.08 0.00 2.1 2.0
1622 IL 318.10 371.59 4.5 17195.0
1623 NJ 306.53 152.05 3.5 28125.0
1624 PA 129.04 119.66 1.8 33011.0
[41]: df_uniq=df.groupby('State')['lc'].nunique()
df_uniq
5
[41]: State
* 1
AK 4
AL 26
AR 9
AZ 57
CA 183
CO 35
CT 12
DE 1
FL 160
GA 69
HI 4
IA 11
ID 17
IL 15
IN 15
KS 14
KY 14
LA 48
MA 21
MD 12
ME 4
MI 36
MN 20
MO 30
MS 12
MT 10
NC 44
ND 2
NE 8
NH 5
NJ 21
NM 12
NV 18
NY 30
OH 28
OK 32
OR 28
PA 40
RI 2
SC 30
SD 11
TN 29
TX 279
UT 20
VA 30
6
VT 3
WA 22
WI 13
WV 3
WY 4
Name: lc, dtype: int64
[42]: df_uniq.plot()
[42]: <AxesSubplot:xlabel='State'>
[56]: plt.plot(df['lc'])
7
[77]: df_uniq.plot(kind='box')
plt.show()
8
[83]: ubst_df_1=df[['Vehicle', 'fm', 'lc']]
ubst_df_1
[83]: Vehicle fm lc
0 1 0.0 66.30
1 2 10.0 233.03
2 3 15.0 325.08
3 4 0.0 66.64
4 5 13.0 328.66
… … … …
1619 1620 11.0 3234.41
1620 1621 0.0 185.08
1621 1622 14.0 318.10
1622 1623 13.0 306.53
1623 1624 23.0 129.04
9
[116]: fig=plt.figure() # creat figure
# add plot in figure
ax1=fig.add_subplot(2,2,1) # add subplot 1 (1 row, 2 coloumns, first plot)
ax2=fig.add_subplot(2,2,2) # add subplot 2 (1 row, 2 coloumns, second plot)
# subplot first 1
df.plot(kind='box', figsize=(10,7), color='red', vert=True, ax=ax1)
# 2 plot line
df.plot(kind='line', figsize=(20,7), ax=ax2)
[116]: <AxesSubplot:>
[119]: ubst_df_1.plot(kind='hist',figsize=(8,5))
[119]: <AxesSubplot:ylabel='Frequency'>
[123]: stat=df[['State']]
stat
10
[123]: State
0 MS
1 CA
2 WI
3 OR
4 AZ
… …
1619 HI
1620 CA
1621 IL
1622 NJ
1623 PA
[128]: df_uniq.plot(kind='box',figsize=(8,5))
[128]: <AxesSubplot:>
[129]: df_uniq1=df.groupby('lc')['State'].nunique()
df_uniq1
[129]: lc
0.00 19
9.45 1
11
10.57 1
11.00 1
11.84 1
..
1711.36 1
1903.43 1
2263.12 1
2652.37 1
3234.41 1
Name: State, Length: 1511, dtype: int64
[152]: subst_df_loc1=df[['State','lc']]
subst_df_loc1
[152]: State lc
0 MS 66.30
1 CA 233.03
2 WI 325.08
3 OR 66.64
4 AZ 328.66
… … …
1619 HI 3234.41
1620 CA 185.08
1621 IL 318.10
1622 NJ 306.53
1623 PA 129.04
[161]: subst_df_loc1.plot(kind='hist',figsize=(20,5))
[161]: <AxesSubplot:ylabel='Frequency'>
12
[168]: Vehicle fm lc Mileage
0 1 0.0 66.30 863.0
1 2 10.0 233.03 4644.0
2 3 15.0 325.08 16330.0
3 4 0.0 66.64 13.0
4 5 13.0 328.66 22537.0
… … … … …
1619 1620 11.0 3234.41 15565.0
1620 1621 0.0 185.08 2.0
1621 1622 14.0 318.10 17195.0
1622 1623 13.0 306.53 28125.0
1623 1624 23.0 129.04 33011.0
[189]: df3.plot(kind='hist',
figsize=(10,6),
alpha=0.6,
color=['g','r','b','y'])
plt.title('Histgram df table')
plt.ylabel('Output')
plt.xlabel('Components')
13
[191]: 'Group States by other variable '
14
PA 45137 355.0 763958.0 109.9 7203.17 7549.29
RI 2858 8.0 14253.0 4.2 326.34 295.92
SC 21617 247.0 592642.0 108.8 7439.07 7082.80
SD 10445 81.0 176511.0 21.5 1437.62 1619.12
TN 22394 284.0 789815.0 95.1 6243.95 5648.47
TX 225444 3002.0 7381131.0 1094.9 79140.87 51613.97
UT 16024 207.0 361139.0 68.7 5329.92 4105.56
VA 28526 323.0 598801.0 74.6 5437.00 4512.68
VT 3525 18.0 22328.0 10.7 624.40 358.98
WA 17264 193.0 380889.0 73.1 5610.80 2894.74
WI 11125 115.0 234319.0 42.7 2863.82 1997.52
WV 3409 23.0 61550.0 5.2 277.88 239.32
WY 3696 62.0 134949.0 8.8 621.48 565.12
[200]: stat1.plot(kind='hist',
figsize=(10,6),
alpha=0.6,
color=['g','r','b','y'])
plt.title('Histgram df table')
plt.ylabel('Output')
plt.xlabel('State')
15
[221]: # Pie plote
stat1['mc'].plot(kind='pie',
figsize=(15,6),
autopct='%1.0f%%',
startangle=150,
shadow=True,
labels=None,
pctdistance=1.12,)
plt.legend(labels=stat1.index, loc='upper left')
16
17
[232]: stat1.plot(kind='bar', figsize=(20,10), rot=60)
plt.plot('State')
plt.annotate('high',
xy=(1,100000),
)
[ ]:
18