Professional Documents
Culture Documents
Pandas Manipulations
Pandas Manipulations
Series Creation
In [3]: # Creating series from List
days = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
days_ser = pd.Series(days)
print(days_ser, type(days_ser))
0 Sunday
1 Monday
2 Tuesday
3 Wednesday
4 Thursday
5 Friday
6 Saturday
In [515]: print(days_ser[0])
Sunday
In [516]: print(days_ser[len(days_ser)-1])
print(days_ser.shape)
print(days_ser.size)
Saturday
(7,)
In [518]: # Slicing
print(days_ser[1:4])
1 Monday
2 Tuesday
3 Wednesday
dtype: object
In [520]: days_ser
day2 Monday
day3 Tuesday
day4 Wednesday
day5 Thursday
day6 Friday
day7 Saturday
dtype: object
In [521]: print(days_ser['day1'])
print(days_ser['day6'])
Sunday
Friday
In [522]: days_ser['day3':'day5': ]
day4 Wednesday
day5 Thursday
dtype: object
day6 Friday
day5 Thursday
day4 Wednesday
day3 Tuesday
day2 Monday
day1 Sunday
dtype: object
Out[524]: st1 MH
st2 UP
st3 MP
st4 AP
st5 KA
st6 TN
st7 WB
st8 RJ
st9 DL
dtype: object
In [525]: state_ser[0]
Out[525]: 'MH'
In [527]: # Slicing
state_ser[3:6:2]
Out[527]: st4 AP
st6 TN
dtype: object
Out[528]: a MUM
b LKO
c BHP
d AMT
a BLR
e CHN
d KOL
f JP
g DL
dtype: object
In [529]: capitals_ser['a']
Out[529]: a MUM
a BLR
dtype: object
In [530]: # capitals_ser['a':'a'] # KeyError: "Cannot get left slice bound for non
Out[531]: MH MUM
UP LKO
MP BHP
AP AMT
KA BLR
TN CHN
WB KOL
RJ JP
DL DL
dtype: object
Out[532]: a NaN
b NaN
c NaN
d NaN
e NaN
f NaN
g NaN
h NaN
i NaN
j NaN
dtype: object
Out[533]: MH MUM
UP LKO
MP BHP
AP AMT
KA BLR
TN CHN
WB KOL
RJ JP
DL DL
PN NaN
dtype: object
In [535]: state_cap
Out[535]: MH MUM
UP LKO
MP BHP
AP AMT
KA BLR
TN CHN
WB KOL
RJ JP
DL DL
PN Chandigarh
JK Kashmir
ZA Zarkhand
dtype: object
DataFrame
In [536]: # Creating dataframe from Dictionary
df_dict = {'Year' : [1990, 1994, 1998, 2002],
'Country' : ['Italy', 'USA', 'France', 'Japan'],
'Winner' : ['Germany', 'Brazil', 'France', 'Brazil'],
'GoalScored' : [115, 141, 171, 161]
}
df_dict = pd.DataFrame(df_dict)
df_dict
Out[536]:
Year Country Winner GoalScored
In [537]: print(type(df_dict))
<class 'pandas.core.frame.DataFrame'>
Out[538]:
Year Country Winner GoalScored
Out[539]:
Year Country Winner GoalScored
Out[540]:
year HostCountry Winner
pd.read_csv
Read a comma-separated values (csv) file into DataFrame.
pd.read_csv(
filepath_or_buffer: 'FilePathOrBuffer',
sep=NoDefault.no_default,
delimiter=None,
header='infer',
names=NoDefault.no_default,
index_col=None,
usecols=None,
squeeze=False,
prefix=NoDefault.no_default,
mangle_dupe_cols=True,
dtype: 'DtypeArg | None' = None,
engine=None,
converters=None,
true_values=None,
false_values=None,
skipinitialspace=False,
skiprows=None,
skipfooter=0,
nrows=None,
na_values=None,
keep_default_na=True,
na_filter=True,
verbose=False,
skip_blank_lines=True,
parse_dates=False,
infer_datetime_format=False,
keep_date_col=False,
date_parser=None,
dayfirst=False,
cache_dates=True,
iterator=False,
chunksize=None,
compression='infer',
thousands=None,
decimal: 'str' = '.',
lineterminator=None,
quotechar='"',
quoting=0,
doublequote=True,
escapechar=None,
comment=None,
encoding=None,
encoding_errors: 'str | None' = 'strict',
dialect=None,
error_bad_lines=None,
warn_bad_lines=None,
on_bad_lines=None,
delim_whitespace=False,
low_memory=True,
memory_map=False,
float_precision=None,
storage_options: 'StorageOptions' = None,
)
Out[541]:
Small Large Total Total
Region Type AveragePrice Date
Bags Bags Bags Volume
2018-03-
0 Atlanta organic 89424.11 207.08 89631.19 190257.38 1.70
25
2018-03-
1 Atlanta conventional 102717.50 153.00 102870.50 202790.74 1.75
18
2018-03-
2 Boston organic 120465.39 18.83 120484.22 236822.98 1.58
11
2018-03-
3 Boston conventional 136877.43 60.60 136938.03 239135.67 1.57
04
2018-02-
4 California organic 66273.89 46.58 66320.47 179041.72 1.82
25
Out[542]:
Region Type AveragePrice
Out[544]:
Region Type AveragePrice
pd.read_excel
localhost:8888/notebooks/OneDrive/Desktop/Data Science/python/Pandas/Pandas Full Tutorial.ipynb 8/70
5/11/22, 9:50 PM
Pandas Full Tutorial - Jupyter Notebook
Out[3]:
Year Country Winner Runners-Up GoalsScored MatchesPlayed
pd.read_clipboard
Read text from clipboard and pass to read_csv.
Out[546]:
0 1 2 3 4 5 6
pd.get_dummies
Convert categorical variable into dummy/indicator variables.
In [547]: avocado_data.head()
Out[547]:
Region Type AveragePrice
Out[548]:
AveragePrice Region_Atlanta Region_Boston Region_California Region_NewYork Region_SanFr
0 1.70 1 0 0 0
1 1.75 1 0 0 0
2 1.58 0 1 0 0
3 1.57 0 1 0 0
4 1.82 0 0 1 0
5 1.01 0 0 1 0
6 1.38 0 0 0 1
7 1.29 0 0 0 1
8 1.16 0 0 0 0
9 1.17 0 0 0 0
Out[550]:
AveragePrice Region_Boston Region_California Region_NewYork Region_SanFrancisco Type_o
0 1.70 0 0 0 0
1 1.75 0 0 0 0
2 1.58 1 0 0 0
3 1.57 1 0 0 0
4 1.82 0 1 0 0
5 1.01 0 1 0 0
6 1.38 0 0 1 0
7 1.29 0 0 1 0
8 1.16 0 0 0 1
9 1.17 0 0 0 1
pd.to_datetime
Convert argument to datetime.
Out[552]:
Date Confirmed Deaths Recovered
0 1/22/20 555 17 28
1 1/23/20 654 18 30
2 1/24/20 941 26 36
3 1/25/20 1434 42 39
4 1/26/20 2118 56 52
<class 'pandas.core.frame.DataFrame'>
Out[555]: 0 2020-01-22
1 2020-01-23
2 2020-01-24
3 2020-01-25
4 2020-01-26
...
259 2020-09-05
260 2020-09-06
261 2020-09-07
262 2020-09-08
263 2020-09-09
pd.to_numeric
Convert argument to a numeric type.
In [556]: pd.to_numeric(daywise['Deaths'])
Out[556]: 0 17
1 18
2 26
3 42
4 56
...
259 879645
260 883414
261 892726
262 897463
263 903759
In [ ]:
pd.unique
Uniques are returned in order
of appearance. This does NOT sort.
In [567]: avocado_data.head()
Out[567]:
Region Type AveragePrice
In [565]: pd.unique(avocado_data['Region'])
dtype=object)
pd.value_counts
Value counts of unique data
In [566]: pd.value_counts(avocado_data['Region'])
Out[566]: Atlanta 2
Boston 2
California 2
NewYork 2
SanFrancisco 2
pd.factorize
Encode the object as an enumerated type or categorical variable.
In [571]: codes
In [572]: uniques
df.abs
Return a Series/DataFrame with absolute numeric value of each element.
Out[573]:
Small Large Total Total
Region Type AveragePrice Date
Bags Bags Bags Volume
2018-03-
0 Atlanta organic 89424.11 207.08 89631.19 190257.38 1.70
25
2018-03-
1 Atlanta conventional 102717.50 153.00 102870.50 202790.74 1.75
18
2018-03-
2 Boston organic 120465.39 18.83 120484.22 236822.98 1.58
11
2018-03-
3 Boston conventional 136877.43 60.60 136938.03 239135.67 1.57
04
2018-02-
4 California organic 66273.89 46.58 66320.47 179041.72 1.82
25
In [575]: avocado_data.head()
Out[575]:
Small Large Total Total
Region Type AveragePrice Date
Bags Bags Bags Volume
2018-03-
0 Atlanta organic 89424.11 -340.80 89631.19 190257.38 1.70
25
2018-03-
1 Atlanta conventional 102717.50 153.00 102870.50 202790.74 1.75
18
2018-03-
2 Boston organic 120465.39 18.83 120484.22 236822.98 1.58
11
2018-03-
3 Boston conventional 136877.43 60.60 136938.03 239135.67 1.57
04
2018-02-
4 California organic 66273.89 46.58 66320.47 179041.72 1.82
25
In [578]: avocado_data.head()
Out[578]:
Small Large Total Total
Region Type AveragePrice Date
Bags Bags Bags Volume
2018-03-
0 Atlanta organic 89424.11 340.80 89631.19 190257.38 1.70
25
2018-03-
1 Atlanta conventional 102717.50 153.00 102870.50 202790.74 1.75
18
2018-03-
2 Boston organic 120465.39 18.83 120484.22 236822.98 1.58
11
2018-03-
3 Boston conventional 136877.43 60.60 136938.03 239135.67 1.57
04
2018-02-
4 California organic 66273.89 46.58 66320.47 179041.72 1.82
25
df.add
Get Addition of dataframe and other, element-wise (binary operator add ).
Out[579]:
Large Bags Total Bags AveragePrice
df.add_prefix
localhost:8888/notebooks/OneDrive/Desktop/Data Science/python/Pandas/Pandas Full Tutorial.ipynb 15/70
5/11/22, 9:50 PM Pandas Full Tutorial - Jupyter Notebook
In [581]: avocado_data.add_prefix('New_')
Out[581]:
New_Small New_Large New_Total New_Total
New_Region New_Type New_AveragePrice N
Bags Bags Bags Volume
df.add_suffix
For DataFrame, the column labels are suffixed.
In [582]: avocado_data.add_suffix('_New')
Out[582]:
Small Large Total Total
Region_New Type_New AveragePrice_New D
Bags_New Bags_New Bags_New Volume_New
df.agg
Aggregate using one or more operations over the specified axis.
Out[583]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
df.aggregate
localhost:8888/notebooks/OneDrive/Desktop/Data Science/python/Pandas/Pandas Full Tutorial.ipynb 17/70
5/11/22, 9:50 PM Pandas Full Tutorial - Jupyter Notebook
Out[587]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
df.all
Return whether all elements are True, potentially over an axis.
In [588]: avocado_data_num.all()
AveragePrice True
dtype: bool
df.any
Return whether any element is True, potentially over an axis.
In [589]: avocado_data_num.any()
AveragePrice True
dtype: bool
df.append
In [590]: avocado_data_num
Out[590]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
In [592]: new_data
Out[592]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
0 10 11 12 13 14
In [593]: avocado_data_num.append(new_data)
Out[593]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
Out[594]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
df.apply
Apply a function along an axis of the DataFrame.
Out[596]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
Out[597]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
df.applymap
Apply a function to a Dataframe elementwise.
Out[598]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
df.astype
Cast a pandas object to a specified dtype dtype .
In [599]: avocado_data_num.astype('int64')
Out[599]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
df.at
Access a single value for a row/column label pair.
In [601]: avocado_data_num
Out[601]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
Out[602]: 340.8
In [604]: avocado_data_num.head(3)
Out[604]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
df.iat
Access a single value for a row/column pair by integer position.
In [605]: avocado_data_num.iat[0, 1]
Out[605]: 100.0
df.boxplot
localhost:8888/notebooks/OneDrive/Desktop/Data Science/python/Pandas/Pandas Full Tutorial.ipynb 23/70
5/11/22, 9:50 PM Pandas Full Tutorial - Jupyter Notebook
In [606]: avocado_data_num.boxplot()
df.columns
Return the column labels of the DataFrame.
In [610]: avocado_data_num.columns
'AveragePrice'],
dtype='object')
df.corr
Compute pairwise correlation of columns, excluding NA/null values.
In [611]: avocado_data_num.corr()
Out[611]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
df.count
Count non-NA cells for each column or row.
If axis=0 or 'index', counts are generated for each column.
If axis=1 or 'columns', counts are generated for each row.
In [613]: avocado_data_num
Out[613]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
Large Bags 10
Total Bags 10
Total Volume 10
AveragePrice 10
dtype: int64
Out[615]: 0 5
1 5
2 5
3 5
4 5
5 5
6 5
7 5
8 5
9 5
dtype: int64
df.cov
Compute pairwise covariance of columns, excluding NA/null values.
In [616]: avocado_data_num.cov()
Out[616]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
df.cummax
Return cumulative maximum over a DataFrame or Series axis.
In [618]: avocado_data_num
Out[618]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
Out[619]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
Out[620]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
df.cummin
Return cumulative minimum over a DataFrame or Series axis.
In [621]: avocado_data_num.cummin()
Out[621]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
df.describe
Generate descriptive statistics.
In [622]: avocado_data_num.describe()
Out[622]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
In [626]: avocado_data_num.describe(percentiles=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9])
Out[626]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
df.drop
Drop specified labels from rows or columns.
In [628]: citibike_tripdata
Out[628]:
start start end end
tripduration starttime stoptime station station station station bikeid name_localizedValu
id name id name
2018-05- 2018-05-
Newport
0 338 01 01 3639 Harborside 3199 33558 Annual Membersh
Pkwy
00:04:47 00:10:25
2018-05- 2018-05-
1 1482 01 01 3681 Grand St 3185 City Hall 33593 24 Ho
01:31:10 01:55:53
2018-05- 2018-05-
Grove
3 190 01 01 3185 City Hall 3186 29662 24 Ho
St PATH
02:03:29 02:06:40
2018-05- 2018-05-
Oakland
4 303 01 01 3207 3195 Sip Ave 15271 Annual Membersh
Ave
04:27:12 04:32:16
Out[629]:
start start end end
tripduration starttime stoptime station station station station bikeid name_localizedValu
id name id name
2018-05- 2018-05-
Newport
0 338 01 01 3639 Harborside 3199 33558 Annual Membersh
Pkwy
00:04:47 00:10:25
2018-05- 2018-05-
1 1482 01 01 3681 Grand St 3185 City Hall 33593 24 Ho
01:31:10 01:55:53
2018-05- 2018-05-
Grove
3 190 01 01 3185 City Hall 3186 29662 24 Ho
St PATH
02:03:29 02:06:40
2018-05- 2018-05-
Oakland
4 303 01 01 3207 3195 Sip Ave 15271 Annual Membersh
Ave
04:27:12 04:32:16
Out[631]:
start start station end end station
tripduration starttime stoptime
station id name station id name
2018-05-01 2018-05-01
1 1482 3681 Grand St 3185 City Hall
01:31:10 01:55:53
2018-05-01 2018-05-01
4 303 3207 Oakland Ave 3195 Sip Ave
04:27:12 04:32:16
Out[632]:
start start end end
tripduration starttime stoptime station station station station bikeid name_localizedValu
id name id name
2018-05- 2018-05-
Newport
0 338 01 01 3639 Harborside 3199 33558 Annual Membersh
Pkwy
00:04:47 00:10:25
2018-05- 2018-05-
Oakland
4 303 01 01 3207 3195 Sip Ave 15271 Annual Membersh
Ave
04:27:12 04:32:16
drop_duplicates
Return DataFrame with duplicate rows removed.
In [634]: citibike_tripdata
Out[634]:
start start end end
tripduration starttime stoptime station station station station bikeid name_localizedValu
id name id name
2018-05- 2018-05-
Newport
0 338 01 01 3639 Harborside 3199 33558 Annual Membersh
Pkwy
00:04:47 00:10:25
2018-05- 2018-05-
1 1482 01 01 3681 Grand St 3185 City Hall 33593 24 Ho
01:31:10 01:55:53
2018-05- 2018-05-
Grove
3 190 01 01 3185 City Hall 3186 29662 24 Ho
St PATH
02:03:29 02:06:40
2018-05- 2018-05-
Oakland
4 303 01 01 3207 3195 Sip Ave 15271 Annual Membersh
Ave
04:27:12 04:32:16
df.dropna
Remove missing values.
Summary 0
Temperature (C) 0
Humidity 0
Visibility (km) 0
Loud Cover 0
Pressure (millibars) 0
Daily Summary 0
dtype: int64
In [638]: weatherHistory.isnull().sum()
Summary 0
Precip Type 0
Temperature (C) 0
Humidity 0
Visibility (km) 0
Loud Cover 0
Pressure (millibars) 0
Daily Summary 0
dtype: int64
df.dtypes
Display the data type of each column.
In [639]: weatherHistory.dtypes
Summary object
Humidity float64
dtype: object
df.duplicated
Return boolean Series denoting duplicate rows.
In [643]: df = pd.DataFrame({
'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie','Indo
'style': ['cup', 'cup', 'cup', 'pack', 'pack','pack'],
'rating': [4, 4, 3.5, 15, 5,5]})
In [644]: df
Out[644]:
brand style rating
Out[645]: 0 False
1 True
2 False
3 False
4 False
5 True
dtype: bool
Out[646]:
brand style rating
df.explode
Transform each element of a list-like to a row, replicating index values.
Out[647]:
city day1 day2 day3 day4 day5
0 P 22 31 27 64 23
1 Q 25 12 20 47 54
Out[648]:
city day1 day2 day3 day4 day5
0 P 22 31 27 64 23
1 Q 25 12 20 47 54
2 R 21 67 41 24 16
3 R 21 67 45 24 16
4 R 21 67 67 24 16
5 R 21 67 90 24 16
6 R 21 67 21 24 16
df.fillna
Fill NA/NaN values using the specified method.
Out[652]:
Year Country Winner GoalScored
In [653]: data.isnull().sum()
Out[653]: Year 0
Country 1
Winner 1
GoalScored 1
dtype: int64
In [654]: data.fillna(0)
Out[654]:
Year Country Winner GoalScored
In [655]: data.fillna("Missing")
Out[655]:
Year Country Winner GoalScored
df.groupby
Group DataFrame using a mapper or by a Series of columns.
Out[656]:
Small Large Total Total
Region Type AveragePrice Date
Bags Bags Bags Volume
2018-03-
0 Atlanta organic 89424.11 207.08 89631.19 190257.38 1.70
25
2018-03-
1 Atlanta conventional 102717.50 153.00 102870.50 202790.74 1.75
18
2018-03-
2 Boston organic 120465.39 18.83 120484.22 236822.98 1.58
11
2018-03-
3 Boston conventional 136877.43 60.60 136938.03 239135.67 1.57
04
2018-02-
4 California organic 66273.89 46.58 66320.47 179041.72 1.82
25
In [657]: g = avocado_data.groupby(by='Type')
'agg', 'aggregate', 'all', 'any', 'apply', 'backfill', 'bfill', 'boxplot', 'corr', 'corrwith', 'count', 'cov',
'cumcount', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'dtypes', 'ewm', 'expanding',
'ffill', 'fillna', 'filter', 'first', 'get_group', 'groups', 'head', 'hist', 'idxmax', 'idxmin', 'indices', 'last', 'mad',
'max', 'mean', 'median', 'min', 'ndim', 'ngroup', 'ngroups', 'nth', 'nunique', 'ohlc', 'pad', 'pct_change',
'pipe', 'plot', 'prod', 'quantile', 'rank', 'resample', 'rolling', 'sample', 'sem', 'shift', 'size', 'skew', 'std',
'sum', 'tail', 'take', 'transform', 'tshift', 'var'
In [658]: g.mean()
Out[658]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
Type
In [659]: g.min()
Out[659]:
Small Large Total Total
Region AveragePrice Date
Bags Bags Bags Volume
Type
In [660]: g.max()
Out[660]:
Small Large Total Total
Region AveragePrice Date
Bags Bags Bags Volume
Type
2018-03-
conventional SanFrancisco 193813.92 609.20 197281.89 1203274.11 1.75
25
2018-03-
organic SanFrancisco 231913.11 1286.43 236417.93 1051308.50 1.82
25
In [661]: g.corr()
Out[661]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
Type
In [662]: g.describe()
Out[662]:
Small Bags
Type
2 rows × 40 columns
df.head
Return the first n rows (default n=5).
Out[663]:
Small Large Total Total
Region Type AveragePrice Date
Bags Bags Bags Volume
2018-03-
0 Atlanta organic 89424.11 207.08 89631.19 190257.38 1.70
25
2018-03-
1 Atlanta conventional 102717.50 153.00 102870.50 202790.74 1.75
18
df.tail
Return the last n rows (default n=5).
In [664]: avocado_data.tail()
Out[664]:
Small Large Total Total
Region Type AveragePrice Date
Bags Bags Bags Volume
2018-
5 California conventional 103033.73 186.20 106984.89 1203274.11 1.01
03-25
2018-
6 NewYork organic 119694.95 92.29 124214.59 777300.99 1.38
03-18
2018-
7 NewYork conventional 193813.92 196.57 197281.89 904333.98 1.29
03-11
2018-
8 SanFrancisco organic 231913.11 1286.43 236417.93 1051308.50 1.16
03-04
2018-
9 SanFrancisco conventional 162913.33 609.20 166836.16 984000.13 1.17
02-25
df.hist
Make a histogram of the DataFrame's columns.
In [665]: avocado_data.hist()
dtype=object)
df.idxmax
localhost:8888/notebooks/OneDrive/Desktop/Data Science/python/Pandas/Pandas Full Tutorial.ipynb 40/70
5/11/22, 9:50 PM Pandas Full Tutorial - Jupyter Notebook
In [666]: avocado_data_num
Out[666]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
Large Bags 8
Total Bags 8
Total Volume 5
AveragePrice 4
dtype: int64
1 Total Volume
2 Total Volume
3 Total Volume
4 Total Volume
5 Total Volume
6 Total Volume
7 Total Volume
8 Total Volume
9 Total Volume
dtype: object
df.idxmin
Return index of first occurrence of minimum over requested axis.
localhost:8888/notebooks/OneDrive/Desktop/Data Science/python/Pandas/Pandas Full Tutorial.ipynb 41/70
5/11/22, 9:50 PM Pandas Full Tutorial - Jupyter Notebook
Large Bags 2
Total Bags 4
Total Volume 4
AveragePrice 5
dtype: int64
Out[670]: 0 AveragePrice
1 AveragePrice
2 AveragePrice
3 AveragePrice
4 AveragePrice
5 AveragePrice
6 AveragePrice
7 AveragePrice
8 AveragePrice
9 AveragePrice
dtype: object
df.iloc
Purely integer-location based indexing for selection by position.
In [671]: avocado_data_num.iloc[2:4,1:4]
Out[671]:
Large Bags Total Bags Total Volume
df.loc
Access a group of rows and columns by label(s).
In [672]: avocado_data_num
Out[672]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
Out[673]:
Small Bags Large Bags Total Bags Total Volume AveragePrice
Out[674]:
Small Bags Large Bags Total Bags
df.index
In [675]: avocado_data_num.index
In [676]: list(avocado_data_num.index)
Out[676]: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
df.info
This method prints information about a DataFrame including
the index dtype and columns, non-null
values and memory usage.
In [677]: avocado_data_num.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
dtypes: float64(5)
df.insert
Insert column into DataFrame at specified location.
In [679]: avocado_data_num
Out[679]:
new_column Small Bags Large Bags Total Bags Total Volume AveragePrice
df.interpolate
Fill NaN values using an interpolation method.
``df.interpolate(method='polynomial', order=5)``.
* 'from_derivatives': Refers to
`scipy.interpolate.BPoly.from_derivatives` which
scipy 0.18.
Out[680]:
Year Country Winner GoalScored
Out[681]:
Year Country Winner GoalScored
Out[683]: 0 115.0
1 141.0
2 151.0
3 161.0
df.isin
Whether each element in the DataFrame is contained in values.
In [684]: df_dict
Out[684]:
Year Country Winner GoalScored
Out[685]:
Year Country Winner GoalScored
df.isna
Detect missing values; returns True for each NaN entry.
In [686]: df_dict.isna()
Out[686]:
Year Country Winner GoalScored
df.isnull
Detect missing values.
In [688]: df_dict.isnull()
Out[688]:
Year Country Winner GoalScored
df.items
Iterate over (column name, Series) pairs.
In [689]: df_dict
Out[689]:
Year Country Winner GoalScored
Year
0 1990
1 1994
2 1998
3 2002
Country
0 Italy
1 USA
2 France
3 Japan
Winner
0 Germany
1 Brazil
2 France
3 Brazil
GoalScored
0 115.0
1 141.0
2 NaN
3 161.0
df.iteritems
Iterate over (column name, Series) pairs.
Year
0 1990
1 1994
2 1998
3 2002
---------
Country
0 Italy
1 USA
2 France
3 Japan
---------
Winner
0 Germany
1 Brazil
2 France
3 Brazil
---------
GoalScored
0 115.0
1 141.0
2 NaN
3 161.0
---------
df.iterrows
Iterate over DataFrame rows as (index, Series) pairs.
In [694]: df_dict
Out[694]:
Year Country Winner GoalScored
Year 1990
Country Italy
Winner Germany
GoalScored 115.0
---------
Year 1994
Country USA
Winner Brazil
GoalScored 141.0
---------
Year 1998
Country France
Winner France
GoalScored NaN
---------
Year 2002
Country Japan
Winner Brazil
GoalScored 161.0
---------
df.itertuples
Iterate over DataFrame rows as namedtuples.
In [698]: list(itertuples)
df.keys()
localhost:8888/notebooks/OneDrive/Desktop/Data Science/python/Pandas/Pandas Full Tutorial.ipynb 51/70
5/11/22, 9:50 PM Pandas Full Tutorial - Jupyter Notebook
In [700]: df_dict
Out[700]:
Year Country Winner GoalScored
In [701]: df_dict.keys()
df.values
Return a Numpy representation of the DataFrame.
Only the values in the DataFrame will be
returned, the axes labels
will be removed.
In [702]: df_dict.values
df.kurt
Return unbiased kurtosis over requested axis.
In [703]: avocado_data_num.kurt()
AveragePrice -1.446656
dtype: float64
In [704]: avocado_data_num.kurtosis()
AveragePrice -1.446656
dtype: float64
df.skew
Return unbiased skew over requested axis.
axis : {index (0), columns (1)}
In [705]: avocado_data_num.skew()
AveragePrice -0.150918
dtype: float64
df.max
Return the maximum of the values over the requested axis.
In [706]: avocado_data_num.max()
AveragePrice 1.82
dtype: float64
df.min
Return the minimum of the values over the requested axis.
In [707]: avocado_data_num.min()
AveragePrice 1.01
dtype: float64
df.median
Return the median of the values over the requested axis.
In [708]: avocado_data_num.median()
AveragePrice 1.475
dtype: float64
df.std
Return sample standard deviation over requested axis.
{index (0), columns (1)}
In [709]: avocado_data_num.std()
AveragePrice 0.280240
dtype: float64
df.var
Return unbiased variance over requested axis.
{index (0), columns (1)}
In [710]: avocado_data_num.var()
AveragePrice 7.853444e-02
dtype: float64
df.melt
Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.
In [711]: avocado_data_num
Out[711]:
new_column Small Bags Large Bags Total Bags Total Volume AveragePrice
In [712]: avocado_data_num.melt()
Out[712]:
variable value
0 new_column 1.00
1 new_column 2.00
2 new_column 3.00
3 new_column 4.00
4 new_column 5.00
5 new_column 6.00
6 new_column 7.00
7 new_column 8.00
8 new_column 9.00
9 new_column 10.00
variable value
50 AveragePrice 1.70
51 AveragePrice 1.75
52 AveragePrice 1.58
53 AveragePrice 1.57
54 AveragePrice 1.82
55 AveragePrice 1.01
56 AveragePrice 1.38
57 AveragePrice 1.29
58 AveragePrice 1.16
59 AveragePrice 1.17
df.memory_usage
Return the memory usage of each column in bytes.
In [713]: df_dict.memory_usage()
Year 32
Country 32
Winner 32
GoalScored 32
dtype: int64
In [714]: avocado_data_num.memory_usage()
new_column 80
Small Bags 80
Large Bags 80
Total Bags 80
Total Volume 80
AveragePrice 80
dtype: int64
df.multiply
Get Multiplication of dataframe and other, element-wise (binary operator mul ).
In [715]: df_dict.multiply(2)
Out[715]:
Year Country Winner GoalScored
df.nunique
Count number of distinct elements in specified axis.
In [717]: df_dict.nunique()
Out[717]: Year 4
Country 4
Winner 3
GoalScored 3
dtype: int64
In [718]: df_dict
Out[718]:
Year Country Winner GoalScored
df.pivot_table
Create a spreadsheet-style pivot table as a DataFrame.
In [721]: df_dict.pivot_table(values=['GoalScored','Year'],index='Winner',aggfunc='mean')
Out[721]:
GoalScored Year
Winner
In [722]: df_dict.pivot_table(values=['GoalScored','Year'],index='Country',aggfunc='mean')
Out[722]:
GoalScored Year
Country
df.pop
Return item and drop from frame. Raise KeyError if not found.
In [723]: df_dict.pop('Year')
Out[723]: 0 1990
1 1994
2 1998
3 2002
In [724]: df_dict
Out[724]:
Country Winner GoalScored
df.rename
Rename column labels.
Out[725]:
country winner_country goal_scored
df.replace
Replace values given in to_replace with value .
Out[726]:
Country Winner GoalScored
Out[727]:
Country Winner GoalScored
df.reset_index
Reset the index of the DataFrame, and use the default one instead.
In [728]: df_dict.reset_index()
Out[728]:
index Country Winner GoalScored
df.sample
Return a random sample of items from an axis of object.
In [729]: df_dict.sample(n=4)
Out[729]:
Country Winner GoalScored
df.shape
Return a tuple representing the dimensionality of the DataFrame.
In [730]: df_dict
Out[730]:
Country Winner GoalScored
In [731]: df_dict.shape
Out[731]: (4, 3)
df.size
Return an int representing the number of elements in this object.
In [732]: df_dict.size
Out[732]: 12
df.sort_index
Sort object by labels (along an axis).
In [733]: df_dict.sort_index(axis=0)
Out[733]:
Country Winner GoalScored
In [734]: df_dict.sort_index(axis=1)
Out[734]:
Country GoalScored Winner
df.sort_values
Sort by the values along either axis.
In [735]: df_dict.sort_values(by='Country')
Out[735]:
Country Winner GoalScored
In [736]: df_dict.sort_values(by='GoalScored')
Out[736]:
Country Winner GoalScored
In [737]: df_dict.sort_values(by=['Winner','GoalScored'])
Out[737]:
Country Winner GoalScored
df.to_clipboard
localhost:8888/notebooks/OneDrive/Desktop/Data Science/python/Pandas/Pandas Full Tutorial.ipynb 63/70
5/11/22, 9:50 PM Pandas Full Tutorial - Jupyter Notebook
In [738]: df_dict.to_clipboard()
In [739]: avocado_data_num.to_clipboard()
df.to_csv
Write object to a comma-separated values (csv) file.
In [458]: df_dict.to_csv('new_dict.csv')
df.to_dict
Convert the DataFrame to a dictionary.
In [457]: df_dict.to_dict()
df.to_excel
Write object to an Excel sheet.
In [459]: df_dict.to_excel('new_excel.xlsx')
df.to_html
Render a DataFrame as an HTML table.
In [462]: df_dict.to_html('dict_html.html')
df.to_json
Convert the object to a JSON string.
In [463]: df_dict.to_json('dict_json.json')
df.to_numpy
Convert the DataFrame to a NumPy array.
In [464]: df_dict.to_numpy()
df.to_parquet
Write a DataFrame to the binary parquet format.
In [466]: df_dict.to_parquet('parquet_file')
df.to_pickle
Pickle (serialize) object to file.
In [467]: df_dict.to_pickle('file.pkl')
df.transform
Call func on self producing a DataFrame with transformed values.
Out[740]:
Country Winner GoalScored
df.transpose()
Transpose index and columns.
In [741]: df_dict.transpose()
Out[741]:
0 1 2 3
In [742]: df_dict.T
Out[742]:
0 1 2 3
In [743]: df_dict
Out[743]:
Country Winner GoalScored
Out[744]:
Country Winner GoalScored
In [745]: avocado_data_num
Out[745]:
new_column Small Bags Large Bags Total Bags Total Volume AveragePrice
Out[746]:
Small Bags Total Bags Total Volume
df.update
Modify in place using non-NA values from another DataFrame.
Out[747]:
A B
0 1 400
1 2 500
2 3 600
Out[748]:
B C
0 4 7
1 5 8
2 6 9
In [749]: df.update(new_df)
In [750]: df
Out[750]:
A B
0 1 4
1 2 5
2 3 6
df.value_counts
Return a Series containing counts of unique rows in the DataFrame.
In [753]: df_dict.value_counts()
dtype: int64
In [754]: df_dict['Winner'].value_counts()
Out[754]: Brazil 2
Germany 1
France 1
df.where
In [755]: df_dict
Out[755]:
Country Winner GoalScored
Out[756]:
new_column Small Bags Large Bags Total Bags Total Volume AveragePrice
Out[757]:
Country Winner GoalScored
Happy Learning