
Pandas

pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation
tool, built on top of the Python programming language.

pandas is composed of 3 different data structures:


Series [1 Dim]
DataFrame [2 Dim]
Panel [N-Dim][combo of Series and DataFrame; deprecated and removed in pandas 1.0]

Series
In [1]: import pandas as pd

In [2]: # create an empty series


pd.Series()

C:\Users\hakim\AppData\Local\Temp/ipykernel_1572/3236373807.py:2: DeprecationWarning:
The default dtype for empty Series will be 'object' instead of 'float64' in a future
version. Specify a dtype explicitly to silence this warning.
  pd.Series()

Out[2]: Series([], dtype: float64)

In [3]: pd.Series(dtype='object')

Out[3]: Series([], dtype: object)

pd.Series(
    data=None,
    index=None,
    dtype: 'Dtype | None' = None,
    name=None,
    copy: 'bool' = False,
    fastpath: 'bool' = False,
)

Features of Series
In [5]: # It is a combination of 3 things
# - Data/values
# - Index
# - dtype
pd.Series([10,20,30,40])

Out[5]: 0 10
1 20
2 30
3 40
dtype: int64
In [6]: # It can accept homogeneous and heterogeneous data
# homogeneous
pd.Series([12.3,4.5,6.8])

Out[6]: 0 12.3
1 4.5
2 6.8
dtype: float64

In [7]: pd.Series([10,20,19.0])

Out[7]: 0 10.0
1 20.0
2 19.0
dtype: float64

In [8]: # when we supply heterogeneous or str data, the dtype is object
pd.Series(['A',10,20,'30'])

Out[8]: 0 A
1 10
2 20
3 30
dtype: object

In [9]: # supply all str data


pd.Series(['A','B','C'])

Out[9]: 0 A
1 B
2 C
dtype: object

In [ ]: # Difference between int32(4 bytes) and int64(8 bytes)

In [10]: a = pd.Series([10,20,30,40,50])
a

Out[10]: 0 10
1 20
2 30
3 40
4 50
dtype: int64

In [11]: a.__sizeof__()

Out[11]: 168

In [12]: b = pd.Series([10,20,30,40,50],dtype='int8')
b

Out[12]: 0 10
1 20
2 30
3 40
4 50
dtype: int8
In [13]: b.__sizeof__()

Out[13]: 133
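The `__sizeof__` comparison above hints at the savings from a smaller dtype. A minimal sketch of the same idea using the documented `memory_usage` method, which reports the bytes used by the values themselves:

```python
import pandas as pd

# Same values at two dtypes: int64 uses 8 bytes per value, int8 uses 1.
a = pd.Series([10, 20, 30, 40, 50])   # defaults to int64
b = a.astype('int8')                  # downcast to int8

print(a.memory_usage(index=False))    # 40 bytes of data (5 values * 8 bytes)
print(b.memory_usage(index=False))    # 5 bytes of data  (5 values * 1 byte)
```

`index=False` excludes the index so only the data buffer is counted.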

In [14]: # the underlying data structure is a NumPy array


a

Out[14]: 0 10
1 20
2 30
3 40
4 50
dtype: int64

In [15]: # indexing is possible


# access 40
a[3]

Out[15]: 40
In [16]: a[-1] # in series -ve indexing is nt allowed

---------------------------------------------------------------------------
ValueError: -1 is not in range

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
----> 1 a[-1]

KeyError: -1
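Purely positional access, including negative offsets, is available through the `.iloc` indexer; a minimal sketch:

```python
import pandas as pd

a = pd.Series([10, 20, 30, 40, 50])

# .iloc always indexes by position, so negative positions work
# even though a[-1] raises a KeyError on the default RangeIndex:
print(a.iloc[-1])   # 50 (last element)
print(a.iloc[-2])   # 40 (second to last)
```

`a[-1]` treats -1 as a label, which does not exist in the index; `a.iloc[-1]` treats it as a position.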

In [17]: # Is direct item assignment possible?


a

Out[17]: 0 10
1 20
2 30
3 40
4 50
dtype: int64

In [18]: a[0] = 100


In [19]: a

Out[19]: 0 100
1 20
2 30
3 40
4 50
dtype: int64

In [20]: id(a)

Out[20]: 2479599958864

In [21]: a[4] = 999

In [22]: a

Out[22]: 0 100
1 20
2 30
3 40
4 999
dtype: int64

In [23]: id(a)

Out[23]: 2479599958864

# After the change, id() doesn't change and the changes persist in the same object,
# hence Series is a mutable data structure

In [25]: # slicing supported


a[:]

Out[25]: 0 100
1 20
2 30
3 40
4 999
dtype: int64

In [26]: a[:2]

Out[26]: 0 100
1 20
dtype: int64

In [27]: #access 100,30,999


a[::2]

Out[27]: 0 100
2 30
4 999
dtype: int64
In [28]: # replace 20,30,40 by 2,3,4 resp.
a

Out[28]: 0 100
1 20
2 30
3 40
4 999
dtype: int64

In [29]: a[1:4]

Out[29]: 1 20
2 30
3 40
dtype: int64

In [30]: a[1:4] = [2,3,4]

In [31]: a

Out[31]: 0 100
1 2
2 3
3 4
4 999
dtype: int64

In [32]: # replace 100 and 999 by 0


a[::4]

Out[32]: 0 100
4 999
dtype: int64

In [33]: a[::4] = 0

In [34]: a

Out[34]: 0 0
1 2
2 3
3 4
4 0
dtype: int64

In [38]: a[0],a[4] = (100,100)

In [39]: a

Out[39]: 0 100
1 2
2 3
3 4
4 100
dtype: int64
In [42]: # we cannot assign multiple values to a single position
a[0] = [10,20]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
----> 2 a[0] = [10,20]

ValueError: setting an array element with a sequence.

In [45]: a[0] = bytes((1,2))

In [46]: a

Out[46]: 0 b'\x01\x02'
1 2
2 3
3 4
4 100
dtype: object

Manipulation of index

In [47]: import numpy as np


s = pd.Series(np.arange(101,111))
s

Out[47]: 0 101
1 102
2 103
3 104
4 105
5 106
6 107
7 108
8 109
9 110
dtype: int32
In [54]: # as we can't use a -ve index, how do we fetch the last element?
s.tail(1)

Out[54]: 9 110
dtype: int32

In [53]: s[len(s)-1]

Out[53]: 110

In [55]: s[len(s)-1:]

Out[55]: 9 110
dtype: int32

In [57]: # if we want to access only the data


# returns an array of values
s.values

Out[57]: array([101, 102, 103, 104, 105, 106, 107, 108, 109, 110])

In [58]: np.array(s)

Out[58]: array([101, 102, 103, 104, 105, 106, 107, 108, 109, 110])

In [59]: #list
list(s)

Out[59]: [101, 102, 103, 104, 105, 106, 107, 108, 109, 110]

In [61]: #dict
print(dict(s))

{0: 101, 1: 102, 2: 103, 3: 104, 4: 105, 5: 106, 6: 107, 7: 108, 8: 109, 9: 110}

In [63]: # access index


s.index

Out[63]: RangeIndex(start=0, stop=10, step=1)

In [64]: # check dtype


s.dtype

Out[64]: dtype('int32')

In [65]: # series type


type(s)

Out[65]: pandas.core.series.Series

In [66]: # check dim


s.ndim

Out[66]: 1

In [67]: #check shape


s.shape

Out[67]: (10,)
In [75]: # now lets see Index manipulation
s = pd.Series(np.arange(101,111),index=range(10,20))
s

Out[75]: 10 101
11 102
12 103
13 104
14 105
15 106
16 107
17 108
18 109
19 110
dtype: int32

In [70]: s

Out[70]: 10 101
11 102
12 103
13 104
14 105
15 106
16 107
17 108
18 109
19 110
dtype: int32

In [ ]: # after creation of Series, change index


s.index = range(10)

In [73]: s

Out[73]: 0 101
1 102
2 103
3 104
4 105
5 106
6 107
7 108
8 109
9 110
dtype: int32

In [82]: # scalar series: 10 customers with same branch_name


b = pd.Series('SBI-Pune',index=[1,2,3,4,5,1,7,8,1,1])
b

Out[82]: 1 SBI-Pune
2 SBI-Pune
3 SBI-Pune
4 SBI-Pune
5 SBI-Pune
1 SBI-Pune
7 SBI-Pune
8 SBI-Pune
1 SBI-Pune
1 SBI-Pune
dtype: object
In [83]: # we have duplicate index
b[1]

Out[83]: 1 SBI-Pune
1 SBI-Pune
1 SBI-Pune
1 SBI-Pune
dtype: object

In [84]: pd.Series('SBI-Pune',index=['A','B',3,4,5,1,7,8,1,1])

Out[84]: A SBI-Pune
B SBI-Pune
3 SBI-Pune
4 SBI-Pune
5 SBI-Pune
1 SBI-Pune
7 SBI-Pune
8 SBI-Pune
1 SBI-Pune
1 SBI-Pune
dtype: object

In [85]: c = pd.Series(['SBI','SBI','BOI','SBI'])
c

Out[85]: 0 SBI
1 SBI
2 BOI
3 SBI
dtype: object

In [88]: # find 'BOI' without using its index

for i in c.str.find('BOI'):
    print(i)  # str.find returns 0 where 'BOI' starts the string, -1 otherwise

-1
-1
0
-1

In [89]: c.str.startswith('B')

Out[89]: 0 False
1 False
2 True
3 False
dtype: bool

In [93]: # the boolean output above can be supplied as an index to fetch the True values
c[c.str.startswith('B')]

Out[93]: 2 BOI
dtype: object
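Besides `str.startswith`, a substring (or regex) match via `str.contains` builds the same kind of boolean mask; a minimal sketch:

```python
import pandas as pd

c = pd.Series(['SBI', 'SBI', 'BOI', 'SBI'])

# str.contains returns True where the pattern occurs anywhere in the string
mask = c.str.contains('BOI')
print(c[mask])    # only the 'BOI' row survives the mask
print(c[~mask])   # ~ inverts the mask: all the non-'BOI' rows
```

The `~` operator inverts a boolean Series element-wise, which is handy for "everything except" filters.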
Let's create a Series of strings and do some data analysis
In [ ]: n = pd.Series([])
n

To do analysis we need questions:

In [ ]: # fetch employees with initial letter P


In [ ]: # we can use boolean output as an input in index


In [ ]: # convert names to upper case


In [ ]: # sort in alphabetical order


In [ ]: # in descending order

In [ ]: # create a series which will have the length of each name
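A sketch of possible answers to the exercises above, using a small stand-in names Series (the notebook loads its own employee list in the next cell):

```python
import pandas as pd

# sample data standing in for the employee names (assumption)
n = pd.Series(['Pallavi', 'Viraj', 'Onkar', 'Sachin'])

print(n[n.str.startswith('P')])                           # names with initial letter P
print(n.str.upper())                                      # upper case
print(n.sort_values(ignore_index=True))                   # alphabetical order
print(n.sort_values(ascending=False, ignore_index=True))  # descending order
print(n.str.len())                                        # length of each name
```

Each answer is a vectorized one-liner: the `str` accessor handles the string work and boolean masks handle the filtering.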



In [1]: import pandas as pd

In [2]: s = pd.Series(['Viraj','Sushen','Tasmeen','Abhishek','Pallavi','Onkar','Sachin','Sanjay'])
s

Out[2]: 0 Viraj
1 Sushen
2 Tasmeen
3 Abhishek
4 Pallavi
5 Onkar
6 Sachin
7 Sanjay
dtype: object

In [3]: # check first 3 record


s.head(3)

Out[3]: 0 Viraj
1 Sushen
2 Tasmeen
dtype: object

In [4]: s[:3]

Out[4]: 0 Viraj
1 Sushen
2 Tasmeen
dtype: object

In [5]: # fetch last record


s.tail(1)

Out[5]: 7 Sanjay
dtype: object

In [7]: # please sort the names in ascending order


s.sort_values()

Out[7]: 3 Abhishek
5 Onkar
4 Pallavi
6 Sachin
7 Sanjay
1 Sushen
2 Tasmeen
0 Viraj
dtype: object
In [8]: # in the above case the index is unordered
s.sort_values(ignore_index=True)

Out[8]: 0 Abhishek
1 Onkar
2 Pallavi
3 Sachin
4 Sanjay
5 Sushen
6 Tasmeen
7 Viraj
dtype: object

In [10]: s.sort_values().sort_index()

Out[10]: 0 Viraj
1 Sushen
2 Tasmeen
3 Abhishek
4 Pallavi
5 Onkar
6 Sachin
7 Sanjay
dtype: object
In [11]: print(dir(s))
['T', '_AXIS_LEN', '_AXIS_ORDERS', ..., 'abs', 'add', 'agg', 'aggregate', 'append',
'apply', 'argmax', 'argmin', 'astype', 'between', 'count', 'describe', 'drop',
'drop_duplicates', 'dropna', 'dtype', 'head', 'iloc', 'index', 'loc', 'map', 'max',
'mean', 'median', 'min', 'sort_index', 'sort_values', 'str', 'tail', 'to_csv',
'to_dict', 'to_list', 'unique', 'value_counts', 'values', 'var', 'view', 'where', 'xs']

In [16]: # convert series to csv file


s.to_csv('sample.csv',index=False)

In [14]: s

Out[14]: 0 Viraj
1 Sushen
2 Tasmeen
3 Abhishek
4 Pallavi
5 Onkar
6 Sachin
7 Sanjay
dtype: object

In [17]: s2 = pd.Series(['Suraj','Sarika','Shavez','Sheela','Jyoti','Sandip'])
s2

Out[17]: 0 Suraj
1 Sarika
2 Shavez
3 Sheela
4 Jyoti
5 Sandip
dtype: object

In [18]: # lets combine series


s.append(s2)

Out[18]: 0 Viraj
1 Sushen
2 Tasmeen
3 Abhishek
4 Pallavi
5 Onkar
6 Sachin
7 Sanjay
0 Suraj
1 Sarika
2 Shavez
3 Sheela
4 Jyoti
5 Sandip
dtype: object
In [19]: # to get a proper continuous index
s.append(s2,ignore_index=True)

Out[19]: 0 Viraj
1 Sushen
2 Tasmeen
3 Abhishek
4 Pallavi
5 Onkar
6 Sachin
7 Sanjay
8 Suraj
9 Sarika
10 Shavez
11 Sheela
12 Jyoti
13 Sandip
dtype: object
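Note that `Series.append` was deprecated in pandas 1.4 and removed in 2.0; `pd.concat` is the supported replacement and takes the same `ignore_index` flag. A minimal sketch with two short stand-in Series:

```python
import pandas as pd

s  = pd.Series(['Viraj', 'Sushen'])
s2 = pd.Series(['Suraj', 'Sarika'])

# pd.concat stacks the Series; ignore_index=True renumbers 0..n-1
combined = pd.concat([s, s2], ignore_index=True)
print(combined)
```

Without `ignore_index=True`, the original index labels are kept and will repeat, exactly as in the plain `append` output above.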

In [21]: # apply() Invoke function on values of Series.


s.apply(len)

Out[21]: 0 5
1 6
2 7
3 8
4 7
5 5
6 6
7 6
dtype: int64

In [23]: # all names needed in upper case


s.apply(str.upper)

Out[23]: 0 VIRAJ
1 SUSHEN
2 TASMEEN
3 ABHISHEK
4 PALLAVI
5 ONKAR
6 SACHIN
7 SANJAY
dtype: object

In [27]: s.apply(lambda nm:'Mr.'+ nm)

Out[27]: 0 Mr.Viraj
1 Mr.Sushen
2 Mr.Tasmeen
3 Mr.Abhishek
4 Mr.Pallavi
5 Mr.Onkar
6 Mr.Sachin
7 Mr.Sanjay
dtype: object
In [28]: s

Out[28]: 0 Viraj
1 Sushen
2 Tasmeen
3 Abhishek
4 Pallavi
5 Onkar
6 Sachin
7 Sanjay
dtype: object

In [31]: # find out which people's names end with n


s.str.endswith('n')

Out[31]: 0 False
1 True
2 True
3 False
4 False
5 False
6 True
7 False
dtype: bool

In [32]: s[s.str.endswith('n')]

Out[32]: 1 Sushen
2 Tasmeen
6 Sachin
dtype: object

In [33]: #fetch name with length of name > 6

Out[33]: 0 Viraj
1 Sushen
2 Tasmeen
3 Abhishek
4 Pallavi
5 Onkar
6 Sachin
7 Sanjay
dtype: object

In [35]: s.apply(len)>6

Out[35]: 0 False
1 False
2 True
3 True
4 True
5 False
6 False
7 False
dtype: bool

In [36]: s[s.apply(len)>6]

Out[36]: 2 Tasmeen
3 Abhishek
4 Pallavi
dtype: object
In [38]: s.sort_values()

Out[38]: 3 Abhishek
5 Onkar
4 Pallavi
6 Sachin
7 Sanjay
1 Sushen
2 Tasmeen
0 Viraj
dtype: object

In [39]: s.sort_values(ascending=False)

Out[39]: 0 Viraj
2 Tasmeen
1 Sushen
7 Sanjay
6 Sachin
4 Pallavi
5 Onkar
3 Abhishek
dtype: object

In [45]: # remove all vowels from the names


'Viraj'.replace('a','')

Out[45]: 'Virj'

In [47]: s.apply(map({'a':'','e':'','i':'','o':'','u':''}))

---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_4884/3589023292.py in <module>
----> 1 s.apply(map({'a':'','e':'','i':'','o':'','u':''}))

TypeError: map() must have at least two arguments.

In [49]: s.map({'Viraj':1})

Out[49]: 0 1.0
1 NaN
2 NaN
3 NaN
4 NaN
5 NaN
6 NaN
7 NaN
dtype: float64

In [50]: s.apply(str.replace(i,'') if i in ['a','e','i','o','u'])

File "C:\Users\hakim\AppData\Local\Temp/ipykernel_4884/3447671120.py", line 1


s.apply(str.replace(i,'') if i in ['a','e','i','o','u'])
^
SyntaxError: invalid syntax
In [54]: for i in s:
    for nm in i:
        print(nm,end='')
    print()

Viraj
Sushen
Tasmeen
Abhishek
Pallavi
Onkar
Sachin
Sanjay

In [58]: s.apply(lambda x:x.lower().replace('a','').replace('e','').replace('i','').replace('o','').replace('u',''))

Out[58]: 0 vrj
1 sshn
2 tsmn
3 bhshk
4 pllv
5 nkr
6 schn
7 snjy
dtype: object
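The chained-`replace` lambda works, but the `str` accessor can strip all vowels in one regex call and, unlike the lambda above, preserve the original casing; a small sketch on a stand-in subset of the names:

```python
import pandas as pd

s = pd.Series(['Viraj', 'Sushen', 'Tasmeen'])

# a character class [aeiouAEIOU] matches any vowel; regex=True enables it
print(s.str.replace('[aeiouAEIOU]', '', regex=True))
```

One vectorized call replaces five chained `.replace()` calls and does not require lowercasing first.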

In [59]: s

Out[59]: 0 Viraj
1 Sushen
2 Tasmeen
3 Abhishek
4 Pallavi
5 Onkar
6 Sachin
7 Sanjay
dtype: object

In [61]: # reverse each name


s.apply(lambda nm:nm[::-1])

Out[61]: 0 jariV
1 nehsuS
2 neemsaT
3 kehsihbA
4 ivallaP
5 raknO
6 nihcaS
7 yajnaS
dtype: object

In [67]: for i in s.apply(reversed):
    print(''.join(list(i)))

jariV
nehsuS
neemsaT
kehsihbA
ivallaP
raknO
nihcaS
yajnaS
In [68]: s.apply(str.re)

Out[68]: 0 Viraj
1 Sushen
2 Tasmeen
3 Abhishek
4 Pallavi
5 Onkar
6 Sachin
7 Sanjay
dtype: object

In [70]: #access alternate name


s[::2]

Out[70]: 0 Viraj
2 Tasmeen
4 Pallavi
6 Sachin
dtype: object

In [76]: s

Out[76]: 0 Viraj
1 Sushen
2 Tasmeen
3 Abhishek
4 Pallavi
5 Onkar
6 Sachin
7 Sanjay
dtype: object

In [77]: g = pd.Series(['123-ABC','456-PQR'])
g

Out[77]: 0 123-ABC
1 456-PQR
dtype: object
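The `g` Series of code-like strings above is a natural fit for `str.split`; a minimal sketch splitting each value into its numeric and alphabetic parts:

```python
import pandas as pd

g = pd.Series(['123-ABC', '456-PQR'])

# expand=True spreads the split pieces into DataFrame columns
parts = g.str.split('-', expand=True)
print(parts[0])   # the part before the dash
print(parts[1])   # the part after the dash
```

Without `expand=True`, each element of the result is a Python list of pieces instead of separate columns.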

In [78]: s.apply(lambda x:x[::-1])

Out[78]: 0 jariV
1 nehsuS
2 neemsaT
3 kehsihbA
4 ivallaP
5 raknO
6 nihcaS
7 yajnaS
dtype: object
In [82]: s.map({'Viraj':4764})

Out[82]: 0 4764.0
1 NaN
2 NaN
3 NaN
4 NaN
5 NaN
6 NaN
7 NaN
dtype: float64

Series
In [1]: import pandas as pd

In [ ]: #Series(data,index,dtype)

In [6]: d = [10,20,30,40]
ind = ['A','B','C','D']
pd.Series(data=d,index=ind)

Out[6]: A 10
B 20
C 30
D 40
dtype: int64

In [4]: # positional args and swapping the objects


pd.Series(ind,d)

Out[4]: 10 A
20 B
30 C
40 D
dtype: object

In [7]: # lets play with dtype


pd.Series(d) #default dtype is int64

Out[7]: 0 10
1 20
2 30
3 40
dtype: int64

In [8]: #lets change dtype


pd.Series(d,dtype='float')

Out[8]: 0 10.0
1 20.0
2 30.0
3 40.0
dtype: float64

In [9]: pd.Series(d,dtype='float32')

Out[9]: 0 10.0
1 20.0
2 30.0
3 40.0
dtype: float32

In [10]: pd.Series(d,dtype='f')

Out[10]: 0 10.0
1 20.0
2 30.0
3 40.0
dtype: float32
In [13]: # float64
pd.Series(d,dtype='f8')

Out[13]: 0 10.0
1 20.0
2 30.0
3 40.0
dtype: float64

In [19]: # using numpy


import numpy as np
pd.Series(d,dtype=np.float32)

Out[19]: 0 10.0
1 20.0
2 30.0
3 40.0
dtype: float32

In [21]: import numpy as np


pd.Series(d,dtype=np.str_)

Out[21]: 0 10
1 20
2 30
3 40
dtype: object

In [22]: # if the series is already created, how do we change the dtype?


sf = pd.Series(d)
sf

Out[22]: 0 10
1 20
2 30
3 40
dtype: int64

In [23]: #convert dtype


sf.astype('str')
# astype is not in-place -- it returns a converted copy

Out[23]: 0 10
1 20
2 30
3 40
dtype: object

In [24]: sf.astype('object')

Out[24]: 0 10
1 20
2 30
3 40
dtype: object
In [25]: sf #original sf is unchanged

Out[25]: 0 10
1 20
2 30
3 40
dtype: int64

In [27]: print(dir(sf))

['T', '_AXIS_LEN', '_AXIS_ORDERS', ... (the same long attribute list as printed for s above) ...]
In [28]: sf

Out[28]: 0 10
1 20
2 30
3 40
dtype: int64

In [29]: sf.shape

Out[29]: (4,)

In [30]: sf.ndim

Out[30]: 1

DataFrame
# 2D data structure
# composed of rows and columns
# we can create multiple rows and multiple columns with different data types
# powerful option for data science and analysis
# it contains many options for selection, filtering, merging, deletion...
# a DataFrame is a combination of multiple Series

In [31]: import pandas as pd


In [32]: # Empty dataFrame
print(pd.DataFrame())

Empty DataFrame
Columns: []
Index: []

# Structure of DataFrame
pd.DataFrame(
    data=None,
    index: 'Axes | None' = None,
    columns: 'Axes | None' = None,
    dtype: 'Dtype | None' = None,
    copy: 'bool | None' = None,
)

Creation of df
# Using: list, tuple, set, dict, numpy, series...

In [33]: pd.DataFrame([10,20,30,40])

Out[33]:     0
0   10
1   20
2   30
3   40

In [35]: pd.DataFrame([10,20,30,40]).shape

Out[35]: (4, 1)

In [36]: pd.DataFrame([10,20,30,40]).ndim

Out[36]: 2

In [34]: pd.Series([10,20,30,40])

Out[34]: 0 10
1 20
2 30
3 40
dtype: int64

In [37]: pd.Series([10,20,30,40]).shape

Out[37]: (4,)

In [38]: pd.Series([10,20,30,40]).ndim

Out[38]: 1
In [39]: # list of list
k = [[1,2],[3,4]]
pd.DataFrame(k)
# each internal list becomes a row

Out[39]:    0  1
0  1  2
1  3  4

In [40]: # Lets change index and column name


pd.DataFrame(k,index=[101,102],columns=['Data_1','Data_2'])

Out[40]:      Data_1  Data_2
101       1       2
102       3       4

In [42]: # positional arguments


#pd.DataFrame(k,[101,102],['Data_1','Data_2'])
pd.DataFrame(k,['Data_1','Data_2'],[101,201])

Out[42]:         101  201
Data_1    1    2
Data_2    3    4

# but if we interchange positions,
# the output changes accordingly
In [43]: # create a df using a tuple


t = 7,8,9 # packing of data
pd.DataFrame(t)

Out[43]:    0
0  7
1  8
2  9

In [44]: 7,8,9

Out[44]: (7, 8, 9)
In [49]: # it accepts homogeneous and heterogeneous values
#pd.DataFrame([10,20])
#pd.DataFrame([10,20,30,40.])
pd.DataFrame([10,'20','A','45',67,'B'])

Out[49]:     0
0  10
1  20
2   A
3  45
4  67
5   B

In [50]: # Using set


s = {(1,2,3),(1,2,5)}
# one tuple is one row
pd.DataFrame(s)

Out[50]:    0  1  2
0  1  2  3
1  1  2  5

In [51]: s = {2,19,10,1,0,99,4}
pd.DataFrame(s)

Out[51]:     0
0   0
1   1
2   2
3  99
4   4
5  19
6  10

In [57]: # Using Dict


d = {'name':['Ashok','Seema'],'age':[23,24],'place':['pune','sangli']}
pd.DataFrame(d)
# keys become the column labels
# values are filled in under their keys

Out[57]:     name  age   place
0  Ashok   23    pune
1  Seema   24  sangli
In [53]: d = {'name':['Ashok','Seema'],'age':[23,24,45],'place':['pune','sangli']}
pd.DataFrame(d)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
----> 2 pd.DataFrame(d)

ValueError: All arrays must be of the same length
In [62]: dg = pd.DataFrame(np.random.randint(20,45,5),columns=['age'])
dg

Out[62]:    age
0   28
1   28
2   31
3   30
4   30

In [65]: # age = 28
dg[dg.age == 28]

Out[65]:    age
0   28
1   28

In [66]: dg.query('age==28')

Out[66]:    age
0   28
1   28
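`query()` can also reference ordinary Python variables by prefixing them with `@`; a small sketch assuming an `age` column like the one above (the random values here are fixed for illustration):

```python
import pandas as pd

dg = pd.DataFrame({'age': [28, 28, 31, 30, 30]})

# @target pulls the Python variable into the query expression
target = 30
print(dg.query('age == @target'))
```

This keeps the filter expression readable while still being parameterized from surrounding code.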

In [67]: # Using Series


s = pd.Series([120,300,500,750])
s

Out[67]: 0 120
1 300
2 500
3 750
dtype: int64
In [68]: pd.DataFrame(s)

Out[68]:      0
0  120
1  300
2  500
3  750

In [69]: # Using numpy


import numpy as np
n = np.random.random(5)
n

Out[69]: array([0.53627799, 0.81381906, 0.35778332, 0.75316441, 0.94354277])

In [70]: n = np.random.random((5,4))
n

Out[70]: array([[0.48607796, 0.68671019, 0.37005585, 0.18746466],


[0.8645104 , 0.19885713, 0.49144365, 0.15115642],
[0.45557155, 0.41497094, 0.18698425, 0.16966141],
[0.46457068, 0.06416644, 0.70635964, 0.9814973 ],
[0.42752716, 0.29029562, 0.72440363, 0.98770071]])

In [71]: pd.DataFrame(n)

Out[71]:           0         1         2         3
0  0.486078  0.686710  0.370056  0.187465
1  0.864510  0.198857  0.491444  0.151156
2  0.455572  0.414971  0.186984  0.169661
3  0.464571  0.064166  0.706360  0.981497
4  0.427527  0.290296  0.724404  0.987701

In [76]: # labeling the column


pd.DataFrame(n,columns=['a','v','c','d'])

Out[76]:           a         v         c         d
0  0.486078  0.686710  0.370056  0.187465
1  0.864510  0.198857  0.491444  0.151156
2  0.455572  0.414971  0.186984  0.169661
3  0.464571  0.064166  0.706360  0.981497
4  0.427527  0.290296  0.724404  0.987701


In [77]: # you may keep the same column name
pd.DataFrame(n,columns=['a','v','c','a'])

Out[77]:           a         v         c         a
0  0.486078  0.686710  0.370056  0.187465
1  0.864510  0.198857  0.491444  0.151156
2  0.455572  0.414971  0.186984  0.169661
3  0.464571  0.064166  0.706360  0.981497
4  0.427527  0.290296  0.724404  0.987701

In [80]: g = pd.Series([10,20,30],[10,20,10])
g[10]

Out[80]: 10 10
10 30
dtype: int64

In [83]: # lets create a dataframe with different data types


y = pd.DataFrame({'Name':['A','V','D','A','F','A'],
'Age':[23,45,60,23,18,90],
'salary':[25.,45.,67.,66,55,89]})
y

Out[83]:   Name  Age  salary
0    A   23    25.0
1    V   45    45.0
2    D   60    67.0
3    A   23    66.0
4    F   18    55.0
5    A   90    89.0

In [84]: y.dtypes

Out[84]: Name       object
Age         int64
salary    float64
dtype: object

In [86]: # need a summary of the dataframe


# prints a concise summary of a DataFrame
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name 6 non-null object
1 Age 6 non-null int64
2 salary 6 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 272.0+ bytes
In [87]: # Descriptive statistics
y.describe() # by default works only on numeric columns

Out[87]:              Age     salary
count   6.000000   6.000000
mean   43.166667  57.833333
std    28.024394  21.784551
min    18.000000  25.000000
25%    23.000000  47.500000
50%    34.000000  60.500000
75%    56.250000  66.750000
max    90.000000  89.000000

In [88]: # if we want to check the description of the object column


y.describe(include='object')

Out[88]:        Name
count      6
unique     4
top        A
freq       3

In [91]: y.Name.unique()

Out[91]: array(['A', 'V', 'D', 'F'], dtype=object)

In [92]: y.Name.nunique()

Out[92]: 4

In [94]: y.Name.value_counts()
# counts the number of occurrences of each category

Out[94]: A 3
V 1
D 1
F 1
Name: Name, dtype: int64
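`value_counts` can also report relative frequencies instead of raw counts via `normalize=True`; a small sketch on the same `Name` values:

```python
import pandas as pd

name = pd.Series(['A', 'V', 'D', 'A', 'F', 'A'])

# normalize=True divides each count by the total, so the result sums to 1
print(name.value_counts(normalize=True))   # 'A' appears in 3 of 6 rows -> 0.5
```

This is a quick way to see category proportions without dividing by `len()` yourself.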

In [1]: import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [7]: # load dataset


# on Windows, use a raw string (r'...') or escape the backslashes in the path
df = pd.read_csv(r'C:\Users\hakim\Downloads\ExportCSV.csv')
df

Out[7]:      Job Title              Email Address                    FirstName LastName    Duration              Country              Empty  New column

0    Cashier                Chadwick_Gordon5732@acrit.org   Chadwick Gordon  1126-03-27 18:41:06Z  Gabon                NaN  5247-4084-7638-0340
1    Healthcare Specialist  Gil_Lindop9421@nanoff.biz       Gil Lindop       9490-11-24 08:46:19Z  Germany              NaN  3722-3713-3784-4667
2    Audiologist            Lillian_Burge7970@fuliss.net    Lillian Burge    7708-12-09 09:02:17Z  Egypt                NaN  2101-6418-4041-2162
3    Clerk                  Cedrick_Farrant4152@deons.tech  Cedrick Farrant  4121-06-08 07:54:11Z  France               NaN  0720-6612-6768-4737
4    Healthcare Specialist  Leslie_Wright2861@cispeto.com   Leslie Wright    0893-05-29 10:23:50Z  Seychelles           NaN  6625-7785-7435-0444
...  ...                    ...                             ...              ...                   ...                  ...  ...
995  Webmaster              Lucy_Whatson4266@zorer.org      Lucy Whatson     4529-07-05 19:49:54Z  Barbados             NaN  0532-1318-2153-2268
996  Healthcare Specialist  Rose_Kirby6758@gmail.com        Rose Kirby       1296-08-27 04:26:53Z  Saudi Arabia         NaN  6810-6415-0575-0738
997  Insurance Broker       Logan_Silva1028@grannar.com     Logan Silva      0704-08-08 12:18:36Z  Singapore            NaN  4171-2722-8456-1171
998  HR Specialist          Aileen_Wise351@tonsy.org        Aileen Wise      2871-01-09 15:19:54Z  Antigua and Barbuda  NaN  7571-3307-6622-5084
999  HR Coordinator         Regina_Grey6872@ovock.tech      Regina Grey      1688-01-25 06:36:18Z  Equatorial Guinea    NaN  7173-6042-6326-1836

1000 rows × 7 columns

In [8]: # check column names


df.columns

Out[8]: Index(['Job Title', 'Email Address', 'FirstName LastName', 'Duration',
        'Country', 'Empty', 'New column'],
       dtype='object')
In [13]: # as we can see there is a need to change column names
# use rename function
# change Job Title
df.rename(columns={'Job Title':'JobTitle'}, inplace=True)

In [14]: df.columns

Out[14]: Index(['JobTitle', 'Email Address', 'FirstName LastName', 'Duration',
         'Country', 'Empty', 'New column'],
        dtype='object')

In [15]: # 2nd approach: replace all column names at once


df.columns = ['JobTitle', 'EmailAddress', 'f_l_name', 'Duration',
'Country', 'Empty', 'Credit_card']

In [16]: df.columns

Out[16]: Index(['JobTitle', 'EmailAddress', 'f_l_name', 'Duration', 'Country', 'Empty',
         'Credit_card'],
        dtype='object')
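rename also accepts a dict mapping several old names to new ones in one call. A minimal sketch on a small stand-in frame (the notebook's df comes from a local CSV):

```python
import pandas as pd

# hypothetical two-column frame standing in for the CSV data
df = pd.DataFrame({'Job Title': ['Cashier'], 'Email Address': ['a@b.org']})

# one rename call can fix several columns at once; keys that don't match
# any column are silently ignored unless errors='raise' is passed
df = df.rename(columns={'Job Title': 'JobTitle',
                        'Email Address': 'EmailAddress'})
print(df.columns.tolist())
```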

In [10]: # Access single column using dot . operator


df.Country

Out[10]: 0 Gabon
1 Germany
2 Egypt
3 France
4 Seychelles
...
995 Barbados
996 Saudi Arabia
997 Singapore
998 Antigua and Barbuda
999 Equatorial Guinea
Name: Country, Length: 1000, dtype: object

In [20]: # another way to access a column


df['JobTitle']

Out[20]: 0 Cashier
1 Healthcare Specialist
2 Audiologist
3 Clerk
4 Healthcare Specialist
...
995 Webmaster
996 Healthcare Specialist
997 Insurance Broker
998 HR Specialist
999 HR Coordinator
Name: JobTitle, Length: 1000, dtype: object
In [22]: # if we want to access multiple columns
df[['JobTitle','Duration']]

Out[22]: JobTitle Duration

0 Cashier 1126-03-27 18:41:06Z

1 Healthcare Specialist 9490-11-24 08:46:19Z

2 Audiologist 7708-12-09 09:02:17Z

3 Clerk 4121-06-08 07:54:11Z

4 Healthcare Specialist 0893-05-29 10:23:50Z

... ... ...

995 Webmaster 4529-07-05 19:49:54Z

996 Healthcare Specialist 1296-08-27 04:26:53Z

997 Insurance Broker 0704-08-08 12:18:36Z

998 HR Specialist 2871-01-09 15:19:54Z

999 HR Coordinator 1688-01-25 06:36:18Z

1000 rows × 2 columns

# df.col      ==> Series
# df['col']   ==> Series
# df[['col']] ==> DataFrame
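The distinction can be checked with type(). A minimal sketch on a small stand-in frame:

```python
import pandas as pd

df = pd.DataFrame({'JobTitle': ['Cashier', 'Clerk'],
                   'Country': ['Gabon', 'France']})

single = df['JobTitle']    # one label      -> Series
framed = df[['JobTitle']]  # list of labels -> DataFrame (even with one column)

print(type(single).__name__)
print(type(framed).__name__)
```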

Now let's work on rows


In [27]: df.JobTitle # it gives all records

Out[27]: 0 Cashier
1 Healthcare Specialist
2 Audiologist
3 Clerk
4 Healthcare Specialist
...
995 Webmaster
996 Healthcare Specialist
997 Insurance Broker
998 HR Specialist
999 HR Coordinator
Name: JobTitle, Length: 1000, dtype: object

In [26]: # indexing over series


df.JobTitle[0]

Out[26]: 'Cashier'

In [31]: df.JobTitle.head(1) #returns series

Out[31]: 0 Cashier
Name: JobTitle, dtype: object
In [28]: # multiple elements using slicing
df.JobTitle[:10]

Out[28]: 0 Cashier
1 Healthcare Specialist
2 Audiologist
3 Clerk
4 Healthcare Specialist
5 Auditor
6 Cashier
7 CNC Operator
8 Staffing Consultant
9 Retail Trainee
Name: JobTitle, dtype: object

In [30]: # the same output, but as a DataFrame


df[['JobTitle']][:10]

Out[30]: JobTitle

0 Cashier

1 Healthcare Specialist

2 Audiologist

3 Clerk

4 Healthcare Specialist

5 Auditor

6 Cashier

7 CNC Operator

8 Staffing Consultant

9 Retail Trainee

In [34]: df.Country[:5][::-1]

Out[34]: 4 Seychelles
3 France
2 Egypt
1 Germany
0 Gabon
Name: Country, dtype: object

In [37]: df.Country[3]

Out[37]: 'France'

In [38]: # replace France with India


df.Country[3] = 'India'

C:\Users\hakim\AppData\Local\Temp/ipykernel_8480/2533705342.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Country[3] = 'India'
In [39]: df.Country

Out[39]: 0 Gabon
1 Germany
2 Egypt
3 India
4 Seychelles
...
995 Barbados
996 Saudi Arabia
997 Singapore
998 Antigua and Barbuda
999 Equatorial Guinea
Name: Country, Length: 1000, dtype: object
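The warning above comes from chained indexing: df.Country[3] assigns into an intermediate object that may be a copy. The fix recommended by the pandas docs is a single .loc assignment; a sketch on a small stand-in frame:

```python
import pandas as pd

df = pd.DataFrame({'Country': ['Gabon', 'Germany', 'Egypt', 'France']})

# one loc call writes to the original frame unambiguously,
# so no SettingWithCopyWarning is raised
df.loc[3, 'Country'] = 'India'
print(df.loc[3, 'Country'])
```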

In [41]: # Access dataframe and 2-3 columns


df[['f_l_name','Country','JobTitle']]

Out[41]: f_l_name Country JobTitle

0 Chadwick Gordon Gabon Cashier

1 Gil Lindop Germany Healthcare Specialist

2 Lillian Burge Egypt Audiologist

3 Cedrick Farrant India Clerk

4 Leslie Wright Seychelles Healthcare Specialist

... ... ... ...

995 Lucy Whatson Barbados Webmaster

996 Rose Kirby Saudi Arabia Healthcare Specialist

997 Logan Silva Singapore Insurance Broker

998 Aileen Wise Antigua and Barbuda HR Specialist

999 Regina Grey Equatorial Guinea HR Coordinator

1000 rows × 3 columns


In [45]: # fetch 50% of the records: several equivalent options
#df[['f_l_name','Country','JobTitle']][:500]      # first 500
#df[['f_l_name','Country','JobTitle']].head(500)  # first 500
#df[['f_l_name','Country','JobTitle']][::2]       # every 2nd row
df[['f_l_name','Country','JobTitle']].tail(500)   # last 500

Out[45]: f_l_name Country JobTitle

500 Havana Gray Turkmenistan Systems Administrator

501 Leah Villiger Moldova Clerk

502 Matthew Addley Rwanda Baker

503 Stacy Benson Antigua and Barbuda Ambulatory Nurse

504 Liliana Baker Trinidad and Tobago Lecturer

... ... ... ...

995 Lucy Whatson Barbados Webmaster

996 Rose Kirby Saudi Arabia Healthcare Specialist

997 Logan Silva Singapore Insurance Broker

998 Aileen Wise Antigua and Barbuda HR Specialist

999 Regina Grey Equatorial Guinea HR Coordinator

500 rows × 3 columns

Access columns and rows using loc (label-based) and iloc (integer-location based)

loc: label-based selection; column names can be used for selection purposes

In [51]: #df[:,:] it wont work on df


# solution??
df.loc[:,:] #[row,column]

Out[51]:     JobTitle               EmailAddress                    f_l_name         Duration              Country              Empty  Credit_card

0    Cashier                Chadwick_Gordon5732@acrit.org   Chadwick Gordon  1126-03-27 18:41:06Z  Gabon                NaN  5247-4084-7638-0340
1    Healthcare Specialist  Gil_Lindop9421@nanoff.biz       Gil Lindop       9490-11-24 08:46:19Z  Germany              NaN  3722-3713-3784-4667
2    Audiologist            Lillian_Burge7970@fuliss.net    Lillian Burge    7708-12-09 09:02:17Z  Egypt                NaN  2101-6418-4041-2162
3    Clerk                  Cedrick_Farrant4152@deons.tech  Cedrick Farrant  4121-06-08 07:54:11Z  India                NaN  0720-6612-6768-4737
4    Healthcare Specialist  Leslie_Wright2861@cispeto.com   Leslie Wright    0893-05-29 10:23:50Z  Seychelles           NaN  6625-7785-7435-0444
...  ...                    ...                             ...              ...                   ...                  ...  ...
995  Webmaster              Lucy_Whatson4266@zorer.org      Lucy Whatson     4529-07-05 19:49:54Z  Barbados             NaN  0532-1318-2153-2268
996  Healthcare Specialist  Rose_Kirby6758@gmail.com        Rose Kirby       1296-08-27 04:26:53Z  Saudi Arabia         NaN  6810-6415-0575-0738
997  Insurance Broker       Logan_Silva1028@grannar.com     Logan Silva      0704-08-08 12:18:36Z  Singapore            NaN  4171-2722-8456-1171
998  HR Specialist          Aileen_Wise351@tonsy.org        Aileen Wise      2871-01-09 15:19:54Z  Antigua and Barbuda  NaN  7571-3307-6622-5084
999  HR Coordinator         Regina_Grey6872@ovock.tech      Regina Grey      1688-01-25 06:36:18Z  Equatorial Guinea    NaN  7173-6042-6326-1836

1000 rows × 7 columns


In [52]: # select rows with labels 0 through 10
df.loc[:10,:] # [rows, columns]
# note: with loc the stop label is inclusive, so this returns 11 rows;
# to get exactly 10 rows, slice with :9

Out[52]:     JobTitle               EmailAddress                         f_l_name         Duration              Country              Empty  Credit_card

0    Cashier                Chadwick_Gordon5732@acrit.org        Chadwick Gordon  1126-03-27 18:41:06Z  Gabon                NaN  5247-4084-7638-0340
1    Healthcare Specialist  Gil_Lindop9421@nanoff.biz            Gil Lindop       9490-11-24 08:46:19Z  Germany              NaN  3722-3713-3784-4667
2    Audiologist            Lillian_Burge7970@fuliss.net         Lillian Burge    7708-12-09 09:02:17Z  Egypt                NaN  2101-6418-4041-2162
3    Clerk                  Cedrick_Farrant4152@deons.tech       Cedrick Farrant  4121-06-08 07:54:11Z  India                NaN  0720-6612-6768-4737
4    Healthcare Specialist  Leslie_Wright2861@cispeto.com        Leslie Wright    0893-05-29 10:23:50Z  Seychelles           NaN  6625-7785-7435-0444
5    Auditor                Johnathan_Kelly6958@gompie.com       Johnathan Kelly  4330-03-09 14:17:00Z  Malawi               NaN  7668-5840-4116-7784
6    Cashier                Oliver_May393@grannar.com            Oliver May       6350-05-22 11:10:26Z  Tajikistan           NaN  2527-6370-6481-8305
7    CNC Operator           Mandy_Jefferson5919@infotech44.tech  Mandy Jefferson  7476-02-09 00:24:15Z  Cyprus               NaN  1161-3378-8384-8763
8    Staffing Consultant    Manuel_Aldridge5304@extex.org        Manuel Aldridge  7556-01-02 04:41:34Z  Chad                 NaN  7234-7585-6765-4063
9    Retail Trainee         Nicole_Vane2814@bretoux.com          Nicole Vane      6818-06-03 21:24:49Z  Burkina Faso         NaN  3447-7506-8463-8164
10   Healthcare Specialist  Chester_Wills2904@gmail.com          Chester Wills    1843-06-19 11:38:49Z  Denmark              NaN  5400-7711-3768-0034

In [55]: # select rows with labels 10-15 and the first 2 columns


df.loc[10:15,'JobTitle':'EmailAddress']

Out[55]: JobTitle EmailAddress

10 Healthcare Specialist Chester_Wills2904@gmail.com

11 Mobile Developer Erick_Redwood6161@nanoff.biz

12 Investment Advisor Havana_Marshall9254@muall.tech

13 Global Logistics Supervisor Ethan_Blythe262@gembat.biz

14 Banker Jacob_Emmett4946@twace.org

15 Associate Professor Alessia_Hale9847@hourpy.biz


In [58]: # rows 21-25 and the columns f_l_name, Country, Credit_card (slice step of 2)
df.loc[21:25,'f_l_name':'Credit_card':2]

Out[58]: f_l_name Country Credit_card

21 Alma Brennan Guyana 7063-8350-8578-5531

22 Barney Dempsey New Zealand 8807-4573-2724-5220

23 Angel Mackenzie Spain 5515-8700-4838-7870

24 Analise Turner Liberia 5631-0776-5646-1144

25 Britney Weasley Morocco 0484-4647-5288-1133

In [62]: # access arbitrary rows: 10, 20, 55, 90


# and the columns: Duration, JobTitle, Credit_card
df.loc[[10,20,55,90],['Duration','JobTitle','Credit_card']]

Out[62]: Duration JobTitle Credit_card

10 1843-06-19 11:38:49Z Healthcare Specialist 5400-7711-3768-0034

20 1233-10-14 01:22:33Z Pharmacist 5617-3685-4812-7221

55 1224-09-29 00:09:25Z Treasurer 2172-5454-3475-5308

90 2717-01-21 00:16:03Z Baker 5501-8051-5887-1838

In [64]: # select 3 random rows from the chosen columns


df[['f_l_name','Country','JobTitle']].sample(3)

Out[64]: f_l_name Country JobTitle

484 Stephanie Nanton Taiwan Food Technologist

754 Gabriel Lindop Tajikistan IT Support Staff

415 Rufus Vollans East Timor (Timor-Leste) Designer


In [65]: df.sample(frac=.5) # frac=.5 draws a random 50% of the rows

Out[65]:     JobTitle               EmailAddress                      f_l_name         Duration              Country                 Empty  Credit_card

560  Loan Officer           Rick_Coates6953@bretoux.com       Rick Coates      7641-01-15 19:23:28Z  Congo                   NaN  4528-2683-1214-1305
23   Staffing Consultant    Angel_Mackenzie4135@mafthy.com    Angel Mackenzie  3611-08-17 08:24:23Z  Spain                   NaN  5515-8700-4838-7870
574  Systems Administrator  Elena_Saunders7388@bungar.biz     Elena Saunders   9194-03-27 14:10:32Z  South Africa            NaN  8012-2427-8270-0606
706  Designer               Ramon_Lane4884@iatim.tech         Ramon Lane       4374-11-25 22:41:07Z  Honduras                NaN  8575-6072-3601-6566
38   Staffing Consultant    Ivy_Latham170@jiman.org           Ivy Latham       8621-02-02 04:12:47Z  Latvia                  NaN  0025-5546-4858-4740
...  ...                    ...                               ...              ...                   ...                     ...  ...
695  Loan Officer           Grace_Robertson5161@atink.com     Grace Robertson  7831-02-18 21:32:44Z  Bosnia and Herzegovina  NaN  1363-8640-8405-3471
472  Bookkeeper             Skylar_Murray4851@ovock.tech      Skylar Murray    0855-01-17 15:20:02Z  Dominica                NaN  3564-4045-5888-1845
495  Associate Professor    Carmella_Morris2386@atink.com     Carmella Morris  2990-12-14 05:22:02Z  Germany                 NaN  3200-8678-5731-0386
4    Healthcare Specialist  Leslie_Wright2861@cispeto.com     Leslie Wright    0893-05-29 10:23:50Z  Seychelles              NaN  6625-7785-7435-0444
170  Health Educator        Josh_Collins1896@infotech44.tech  Josh Collins     6794-05-31 19:02:02Z  Micronesia              NaN  6275-7433-7282-6214

500 rows × 7 columns
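sample draws differently on every call; passing random_state makes the draw reproducible. A minimal sketch on a small stand-in frame:

```python
import pandas as pd

df = pd.DataFrame({'x': range(10)})

# frac=0.5 keeps half the rows; the same random_state
# always yields the same sample
half = df.sample(frac=0.5, random_state=42)
print(len(half))
```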

In [66]: # info() gives a concise summary of the dataset


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 JobTitle 1000 non-null object
1 EmailAddress 1000 non-null object
2 f_l_name 1000 non-null object
3 Duration 1000 non-null object
4 Country 1000 non-null object
5 Empty 0 non-null float64
6 Credit_card 1000 non-null object
dtypes: float64(1), object(6)
memory usage: 54.8+ KB

# a column has both a label (name) and an integer position

iloc: integer-position based access
In [67]: df.iloc[:,:]

Out[67]:     JobTitle               EmailAddress                    f_l_name         Duration              Country              Empty  Credit_card

0    Cashier                Chadwick_Gordon5732@acrit.org   Chadwick Gordon  1126-03-27 18:41:06Z  Gabon                NaN  5247-4084-7638-0340
1    Healthcare Specialist  Gil_Lindop9421@nanoff.biz       Gil Lindop       9490-11-24 08:46:19Z  Germany              NaN  3722-3713-3784-4667
2    Audiologist            Lillian_Burge7970@fuliss.net    Lillian Burge    7708-12-09 09:02:17Z  Egypt                NaN  2101-6418-4041-2162
3    Clerk                  Cedrick_Farrant4152@deons.tech  Cedrick Farrant  4121-06-08 07:54:11Z  India                NaN  0720-6612-6768-4737
4    Healthcare Specialist  Leslie_Wright2861@cispeto.com   Leslie Wright    0893-05-29 10:23:50Z  Seychelles           NaN  6625-7785-7435-0444
...  ...                    ...                             ...              ...                   ...                  ...  ...
995  Webmaster              Lucy_Whatson4266@zorer.org      Lucy Whatson     4529-07-05 19:49:54Z  Barbados             NaN  0532-1318-2153-2268
996  Healthcare Specialist  Rose_Kirby6758@gmail.com        Rose Kirby       1296-08-27 04:26:53Z  Saudi Arabia         NaN  6810-6415-0575-0738
997  Insurance Broker       Logan_Silva1028@grannar.com     Logan Silva      0704-08-08 12:18:36Z  Singapore            NaN  4171-2722-8456-1171
998  HR Specialist          Aileen_Wise351@tonsy.org        Aileen Wise      2871-01-09 15:19:54Z  Antigua and Barbuda  NaN  7571-3307-6622-5084
999  HR Coordinator         Regina_Grey6872@ovock.tech      Regina Grey      1688-01-25 06:36:18Z  Equatorial Guinea    NaN  7173-6042-6326-1836

1000 rows × 7 columns

In [68]: # work on rows


# in the case of iloc, the stop is exclusive
df.iloc[:5,:]

Out[68]:     JobTitle               EmailAddress                    f_l_name         Duration              Country     Empty  Credit_card

0    Cashier                Chadwick_Gordon5732@acrit.org   Chadwick Gordon  1126-03-27 18:41:06Z  Gabon       NaN  5247-4084-7638-0340
1    Healthcare Specialist  Gil_Lindop9421@nanoff.biz       Gil Lindop       9490-11-24 08:46:19Z  Germany     NaN  3722-3713-3784-4667
2    Audiologist            Lillian_Burge7970@fuliss.net    Lillian Burge    7708-12-09 09:02:17Z  Egypt       NaN  2101-6418-4041-2162
3    Clerk                  Cedrick_Farrant4152@deons.tech  Cedrick Farrant  4121-06-08 07:54:11Z  India       NaN  0720-6612-6768-4737
4    Healthcare Specialist  Leslie_Wright2861@cispeto.com   Leslie Wright    0893-05-29 10:23:50Z  Seychelles  NaN  6625-7785-7435-0444
In [70]: # work on columns: the first 3 columns
df.iloc[:5,:3] # stop is exclusive

Out[70]: JobTitle EmailAddress f_l_name

0 Cashier Chadwick_Gordon5732@acrit.org Chadwick Gordon

1 Healthcare Specialist Gil_Lindop9421@nanoff.biz Gil Lindop

2 Audiologist Lillian_Burge7970@fuliss.net Lillian Burge

3 Clerk Cedrick_Farrant4152@deons.tech Cedrick Farrant

4 Healthcare Specialist Leslie_Wright2861@cispeto.com Leslie Wright

In [72]: # access EmailAddress and Credit_card (positions 1 and 6: start at 1, step 5)


df.iloc[:,1::5]

Out[72]: EmailAddress Credit_card

0 Chadwick_Gordon5732@acrit.org 5247-4084-7638-0340

1 Gil_Lindop9421@nanoff.biz 3722-3713-3784-4667

2 Lillian_Burge7970@fuliss.net 2101-6418-4041-2162

3 Cedrick_Farrant4152@deons.tech 0720-6612-6768-4737

4 Leslie_Wright2861@cispeto.com 6625-7785-7435-0444

... ... ...

995 Lucy_Whatson4266@zorer.org 0532-1318-2153-2268

996 Rose_Kirby6758@gmail.com 6810-6415-0575-0738

997 Logan_Silva1028@grannar.com 4171-2722-8456-1171

998 Aileen_Wise351@tonsy.org 7571-3307-6622-5084

999 Regina_Grey6872@ovock.tech 7173-6042-6326-1836

1000 rows × 2 columns


In [73]: # reverse the columns
df.iloc[:,::-1]

Out[73]:     Credit_card          Empty  Country              Duration              f_l_name         EmailAddress                    JobTitle

0    5247-4084-7638-0340  NaN  Gabon                1126-03-27 18:41:06Z  Chadwick Gordon  Chadwick_Gordon5732@acrit.org   Cashier
1    3722-3713-3784-4667  NaN  Germany              9490-11-24 08:46:19Z  Gil Lindop       Gil_Lindop9421@nanoff.biz       Healthcare Specialist
2    2101-6418-4041-2162  NaN  Egypt                7708-12-09 09:02:17Z  Lillian Burge    Lillian_Burge7970@fuliss.net    Audiologist
3    0720-6612-6768-4737  NaN  India                4121-06-08 07:54:11Z  Cedrick Farrant  Cedrick_Farrant4152@deons.tech  Clerk
4    6625-7785-7435-0444  NaN  Seychelles           0893-05-29 10:23:50Z  Leslie Wright    Leslie_Wright2861@cispeto.com   Healthcare Specialist
...  ...                  ...  ...                  ...                   ...              ...                             ...
995  0532-1318-2153-2268  NaN  Barbados             4529-07-05 19:49:54Z  Lucy Whatson     Lucy_Whatson4266@zorer.org      Webmaster
996  6810-6415-0575-0738  NaN  Saudi Arabia         1296-08-27 04:26:53Z  Rose Kirby       Rose_Kirby6758@gmail.com        Healthcare Specialist
997  4171-2722-8456-1171  NaN  Singapore            0704-08-08 12:18:36Z  Logan Silva      Logan_Silva1028@grannar.com     Insurance Broker
998  7571-3307-6622-5084  NaN  Antigua and Barbuda  2871-01-09 15:19:54Z  Aileen Wise      Aileen_Wise351@tonsy.org        HR Specialist
999  7173-6042-6326-1836  NaN  Equatorial Guinea    1688-01-25 06:36:18Z  Regina Grey      Regina_Grey6872@ovock.tech      HR Coordinator

1000 rows × 7 columns

In [75]: # access arbitrary rows and columns by integer position


df.iloc[[0,34,98,100],[3,0,5]]

Out[75]: Duration JobTitle Empty

0 1126-03-27 18:41:06Z Cashier NaN

34 3152-07-22 14:41:53Z Paramedic NaN

98 8071-06-02 20:22:15Z Staffing Consultant NaN

100 0497-05-25 22:33:31Z Operator NaN


In [78]: # find the record count per job title
df.JobTitle.value_counts()

Out[78]: Healthcare Specialist 28
Staffing Consultant 26
Health Educator 25
Physician 24
Machine Operator 24
CNC Operator 23
Electrician 23
Associate Professor 23
Inspector 22
Web Developer 22
Pharmacist 21
Project Manager 21
Ambulatory Nurse 21
HR Coordinator 21
Webmaster 20
Paramedic 20
IT Support Staff 20
Global Logistics Supervisor 20
Doctor 20
Lecturer 19
Systems Administrator 19
Assistant Buyer 19
Stockbroker 19
Banker 18
Operator 18
Steward 18
Clerk 18
Dentist 17
Software Engineer 17
Cook 16
Service Supervisor 16
Bellman 16
Budget Analyst 15
Laboratory Technician 15
Baker 15
Cashier 14
Fabricator 14
Call Center Representative 14
Cash Manager 14
Business Broker 14
Investment Advisor 14
Design Engineer 14
Food Technologist 13
Loan Officer 13
Mobile Developer 12
Retail Trainee 12
Bookkeeper 12
Biologist 12
Audiologist 12
Auditor 12
Production Painter 11
Accountant 11
Designer 11
Executive Director 11
Front Desk Coordinator 11
Chef Manager 11
Treasurer 11
Insurance Broker 11
HR Specialist 11
Restaurant Manager 6
Name: JobTitle, dtype: int64
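For a long tally like the one above, value_counts sorts by frequency in descending order, so chaining head() gives just the most common categories. A minimal sketch on a small stand-in Series:

```python
import pandas as pd

titles = pd.Series(['A', 'A', 'A', 'B', 'B', 'C', 'D'])

# value_counts sorts descending by count, so head(2) is the top two
top2 = titles.value_counts().head(2)
print(top2)
```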
In [ ]:
