Professional Documents
Culture Documents
Pandas
Pandas
pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation
tool, built on top of the Python programming language.
Series
In [1]: import pandas as pd
C:\Users\hakim\AppData\Local\Temp/ipykernel_1572/3236373807.py:2: DeprecationWarnin
g: The default dtype for empty Series will be 'object' instead of 'float64' in a fut
ure version. Specify a dtype explicitly to silence this warning.
pd.Series()
In [3]: pd.Series(dtype='object')
pd.Series(
data=None,
index=None,
dtype: 'Dtype | None' = None,
name=None,
copy: 'bool' = False,
fastpath: 'bool' = False,
)
Features of Series
In [5]: # It is a combination of 3 things
#- Data/values
#- Index
#- dtype
pd.Series([10,20,30,40])
Out[5]: 0 10
1 20
2 30
3 40
dtype: int64
In [6]: # It can accept homogeneous and heterogeneous data
# homogeneous
pd.Series([12.3,4.5,6.8])
Out[6]: 0 12.3
1 4.5
2 6.8
dtype: float64
In [7]: pd.Series([10,20,19.0])
Out[7]: 0 10.0
1 20.0
2 19.0
dtype: float64
In [8]: # when we supply heterogeneous data or str data, the dtype is object
pd.Series(['A',10,20,'30'])
Out[8]: 0 A
1 10
2 20
3 30
dtype: object
Out[9]: 0 A
1 B
2 C
dtype: object
In [10]: a = pd.Series([10,20,30,40,50])
a
Out[10]: 0 10
1 20
2 30
3 40
4 50
dtype: int64
In [11]: a.__sizeof__()
Out[11]: 168
In [12]: b = pd.Series([10,20,30,40,50],dtype='int8')
b
Out[12]: 0 10
1 20
2 30
3 40
4 50
dtype: int8
In [13]: b.__sizeof__()
Out[13]: 133
Out[14]: 0 10
1 20
2 30
3 40
4 50
dtype: int64
Out[15]: 40
In [16]: a[-1] # with the default integer index, -1 is looked up as a label, so negative indexing raises KeyError
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\range.py in get_loc(s
elf, key, method, tolerance)
384 try:
--> 385 return self._range.index(new_key)
386 except ValueError as err:
The above exception was the direct cause of the following exception:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in __getitem__(sel
f, key)
940
941 elif key_is_scalar:
--> 942 return self._get_value(key)
943
944 if is_hashable(key):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in _get_value(self,
label, takeable)
1049
1050 # Similar to Index.get_value, but we do not fall back to positional
-> 1051 loc = self.index.get_loc(label)
1052 return self.index._get_values_for_loc(self, loc, label)
1053
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\range.py in get_loc(s
elf, key, method, tolerance)
385 return self._range.index(new_key)
386 except ValueError as err:
--> 387 raise KeyError(key) from err
388 raise KeyError(key)
389 return super().get_loc(key, method=method, tolerance=tolerance)
KeyError: -1
Out[17]: 0 10
1 20
2 30
3 40
4 50
dtype: int64
Out[19]: 0 100
1 20
2 30
3 40
4 50
dtype: int64
In [20]: id(a)
Out[20]: 2479599958864
In [22]: a
Out[22]: 0 100
1 20
2 30
3 40
4 999
dtype: int64
In [23]: id(a)
Out[23]: 2479599958864
# After the change, the id doesn't change and the changes persist in the same object,
# hence Series is a mutable data structure
Out[25]: 0 100
1 20
2 30
3 40
4 999
dtype: int64
In [26]: a[:2]
Out[26]: 0 100
1 20
dtype: int64
Out[27]: 0 100
2 30
4 999
dtype: int64
In [28]: # replace 20,30,40 by 2,3,4 resp.
a
Out[28]: 0 100
1 20
2 30
3 40
4 999
dtype: int64
In [29]: a[1:4]
Out[29]: 1 20
2 30
3 40
dtype: int64
In [31]: a
Out[31]: 0 100
1 2
2 3
3 4
4 999
dtype: int64
Out[32]: 0 100
4 999
dtype: int64
In [33]: a[::4] = 0
In [34]: a
Out[34]: 0 0
1 2
2 3
3 4
4 0
dtype: int64
In [39]: a
Out[39]: 0 100
1 2
2 3
3 4
4 100
dtype: int64
In [42]: # if we try to assign multiple values (a list) to a single position, it fails
a[0] = [10,20]
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
TypeError: int() argument must be a string, a bytes-like object or a number, not 'li
st'
The above exception was the direct cause of the following exception:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in _set_with_engine
(self, key, value)
1098 validate_numeric_casting(self.dtype, value) # type: ignore[arg-typ
e]
-> 1099 self._values[loc] = value
1100
The above exception was the direct cause of the following exception:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in __setitem__(sel
f, key, value)
1068 else:
1069 # GH#12862 adding a new key to the Series
-> 1070 self.loc[key] = value
1071
1072 except TypeError as err:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in __setitem__(se
lf, key, value)
721
722 iloc = self if self.name == "iloc" else self.obj.iloc
--> 723 iloc._setitem_with_indexer(indexer, value, self.name)
724
725 def _validate_key(self, key, axis: int):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _setitem_with_
indexer(self, indexer, value, name)
1730 self._setitem_with_indexer_split_path(indexer, value, name)
1731 else:
-> 1732 self._setitem_single_block(indexer, value, name)
1733
1734 def _setitem_with_indexer_split_path(self, indexer, value, name: str):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _setitem_singl
e_block(self, indexer, value, name)
1966
1967 # actually do the set
-> 1968 self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value)
1969 self.obj._maybe_update_cacher(clear=True)
1970
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in seti
tem(self, indexer, value)
353
354 def setitem(self: T, indexer, value) -> T:
--> 355 return self.apply("setitem", indexer=indexer, value=value)
356
357 def putmask(self, mask, new, align: bool = True):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in appl
y(self, f, align_keys, ignore_failures, **kwargs)
325 applied = b.apply(f, **kwargs)
326 else:
--> 327 applied = getattr(b, f)(**kwargs)
328 except (TypeError, NotImplementedError):
329 if not ignore_failures:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in setite
m(self, indexer, value)
951 # setting a single element for each dim and with a rhs that coul
d
952 # be e.g. a list; see GH#6043
--> 953 values[indexer] = value
954
955 elif exact_match and is_categorical_dtype(arr_value.dtype):
In [46]: a
Out[46]: 0 b'\x01\x02'
1 2
2 3
3 4
4 100
dtype: object
Manipulation of index
Out[47]: 0 101
1 102
2 103
3 104
4 105
5 106
6 107
7 108
8 109
9 110
dtype: int32
In [54]: # since we can't use a negative index here, how do we fetch the last element?
s.tail(1)
Out[54]: 9 110
dtype: int32
In [53]: s[len(s)-1]
Out[53]: 110
In [55]: s[len(s)-1:]
Out[55]: 9 110
dtype: int32
Out[57]: array([101, 102, 103, 104, 105, 106, 107, 108, 109, 110])
In [58]: np.array(s)
Out[58]: array([101, 102, 103, 104, 105, 106, 107, 108, 109, 110])
In [59]: #list
list(s)
Out[59]: [101, 102, 103, 104, 105, 106, 107, 108, 109, 110]
In [61]: #dict
print(dict(s))
{0: 101, 1: 102, 2: 103, 3: 104, 4: 105, 5: 106, 6: 107, 7: 108, 8: 109, 9: 110}
Out[64]: dtype('int32')
Out[65]: pandas.core.series.Series
Out[66]: 1
Out[67]: (10,)
In [75]: # now let's see index manipulation
s = pd.Series(np.arange(101,111),index=range(10,20))
s
Out[75]: 10 101
11 102
12 103
13 104
14 105
15 106
16 107
17 108
18 109
19 110
dtype: int32
In [70]: s
Out[70]: 10 101
11 102
12 103
13 104
14 105
15 106
16 107
17 108
18 109
19 110
dtype: int32
In [73]: s
Out[73]: 0 101
1 102
2 103
3 104
4 105
5 106
6 107
7 108
8 109
9 110
dtype: int32
Out[82]: 1 SBI-Pune
2 SBI-Pune
3 SBI-Pune
4 SBI-Pune
5 SBI-Pune
1 SBI-Pune
7 SBI-Pune
8 SBI-Pune
1 SBI-Pune
1 SBI-Pune
dtype: object
In [83]: # we have duplicate index
b[1]
Out[83]: 1 SBI-Pune
1 SBI-Pune
1 SBI-Pune
1 SBI-Pune
dtype: object
In [84]: pd.Series('SBI-Pune',index=['A','B',3,4,5,1,7,8,1,1])
Out[84]: A SBI-Pune
B SBI-Pune
3 SBI-Pune
4 SBI-Pune
5 SBI-Pune
1 SBI-Pune
7 SBI-Pune
8 SBI-Pune
1 SBI-Pune
1 SBI-Pune
dtype: object
In [85]: c = pd.Series(['SBI','SBI','BOI','SBI'])
c
Out[85]: 0 SBI
1 SBI
2 BOI
3 SBI
dtype: object
-1
-1
0
-1
In [89]: c.str.startswith('B')
Out[89]: 0 False
1 False
2 True
3 False
dtype: bool
In [93]: # the boolean output above can be supplied as an index to fetch the values where it is True
c[c.str.startswith('B')]
Out[93]: 2 BOI
dtype: object
Let's create a Series of strings and do some data analysis
In [ ]: n = pd.Series([])
n
In [2]: s = pd.Series(['Viraj','Sushen','Tasmeen','Abhishek','Pallavi','Onkar','Sachin','Sanjay'])
s
Out[2]: 0 Viraj
1 Sushen
2 Tasmeen
3 Abhishek
4 Pallavi
5 Onkar
6 Sachin
7 Sanjay
dtype: object
Out[3]: 0 Viraj
1 Sushen
2 Tasmeen
dtype: object
In [4]: s[:3]
Out[4]: 0 Viraj
1 Sushen
2 Tasmeen
dtype: object
Out[5]: 7 Sanjay
dtype: object
Out[7]: 3 Abhishek
5 Onkar
4 Pallavi
6 Sachin
7 Sanjay
1 Sushen
2 Tasmeen
0 Viraj
dtype: object
In [8]: # in the above case the index is unordered
s.sort_values(ignore_index=True)
Out[8]: 0 Abhishek
1 Onkar
2 Pallavi
3 Sachin
4 Sanjay
5 Sushen
6 Tasmeen
7 Viraj
dtype: object
In [10]: s.sort_values().sort_index()
Out[10]: 0 Viraj
1 Sushen
2 Tasmeen
3 Abhishek
4 Pallavi
5 Onkar
6 Sachin
7 Sanjay
dtype: object
In [11]: print(dir(s))
['T', '_AXIS_LEN', '_AXIS_ORDERS', '_AXIS_REVERSED', '_AXIS_TO_AXIS_NUMBER', '_HANDL
ED_TYPES', '__abs__', '__add__', '__and__', '__annotations__', '__array__', '__array
_priority__', '__array_ufunc__', '__array_wrap__', '__bool__', '__class__', '__conta
ins__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir
__', '__divmod__', '__doc__', '__eq__', '__finalize__', '__float__', '__floordiv__',
'__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstat
e__', '__gt__', '__hash__', '__iadd__', '__iand__', '__ifloordiv__', '__imod__', '__
imul__', '__init__', '__init_subclass__', '__int__', '__invert__', '__ior__', '__ipo
w__', '__isub__', '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__lo
ng__', '__lt__', '__matmul__', '__mod__', '__module__', '__mul__', '__ne__', '__neg_
_', '__new__', '__nonzero__', '__or__', '__pos__', '__pow__', '__radd__', '__rand_
_', '__rdivmod__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rm
atmul__', '__rmod__', '__rmul__', '__ror__', '__round__', '__rpow__', '__rsub__', '_
_rtruediv__', '__rxor__', '__setattr__', '__setitem__', '__setstate__', '__sizeof_
_', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__weakref__', '__xor_
_', '_accessors', '_accum_func', '_add_numeric_operations', '_agg_by_level', '_agg_e
xamples_doc', '_agg_see_also_doc', '_align_frame', '_align_series', '_arith_method',
'_as_manager', '_attrs', '_binop', '_can_hold_na', '_check_inplace_and_allows_duplic
ate_labels', '_check_inplace_setting', '_check_is_chained_assignment_possible', '_ch
eck_label_or_level_ambiguity', '_check_setitem_copy', '_clear_item_cache', '_clip_wi
th_one_bound', '_clip_with_scalar', '_cmp_method', '_consolidate', '_consolidate_inp
lace', '_construct_axes_dict', '_construct_axes_from_arguments', '_construct_resul
t', '_constructor', '_constructor_expanddim', '_convert', '_convert_dtypes', '_dat
a', '_dir_additions', '_dir_deletions', '_drop_axis', '_drop_labels_or_levels', '_du
plicated', '_find_valid_index', '_flags', '_from_mgr', '_get_axis', '_get_axis_nam
e', '_get_axis_number', '_get_axis_resolvers', '_get_block_manager_axis', '_get_bool
_data', '_get_cacher', '_get_cleaned_column_resolvers', '_get_index_resolvers', '_ge
t_label_or_level_values', '_get_numeric_data', '_get_value', '_get_values', '_get_va
lues_tuple', '_get_with', '_gotitem', '_hidden_attrs', '_index', '_indexed_same', '_
info_axis', '_info_axis_name', '_info_axis_number', '_init_dict', '_init_mgr', '_inp
lace_method', '_internal_names', '_internal_names_set', '_is_cached', '_is_copy', '_
is_label_or_level_reference', '_is_label_reference', '_is_level_reference', '_is_mix
ed_type', '_is_view', '_item_cache', '_ixs', '_logical_func', '_logical_method', '_m
ap_values', '_maybe_update_cacher', '_memory_usage', '_metadata', '_mgr', '_min_coun
t_stat_function', '_name', '_needs_reindex_multi', '_protect_consolidate', '_reduc
e', '_reindex_axes', '_reindex_indexer', '_reindex_multi', '_reindex_with_indexers',
'_replace_single', '_repr_data_resource_', '_repr_latex_', '_reset_cache', '_reset_c
acher', '_set_as_cached', '_set_axis', '_set_axis_name', '_set_axis_nocheck', '_set_
is_copy', '_set_labels', '_set_name', '_set_value', '_set_values', '_set_with', '_se
t_with_engine', '_slice', '_stat_axis', '_stat_axis_name', '_stat_axis_number', '_st
at_function', '_stat_function_ddof', '_take_with_is_copy', '_typ', '_update_inplac
e', '_validate_dtype', '_values', '_where', 'abs', 'add', 'add_prefix', 'add_suffi
x', 'agg', 'aggregate', 'align', 'all', 'any', 'append', 'apply', 'argmax', 'argmi
n', 'argsort', 'array', 'asfreq', 'asof', 'astype', 'at', 'at_time', 'attrs', 'autoc
orr', 'axes', 'backfill', 'between', 'between_time', 'bfill', 'bool', 'clip', 'combi
ne', 'combine_first', 'compare', 'convert_dtypes', 'copy', 'corr', 'count', 'cov',
'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'div', 'divide', 'divmo
d', 'dot', 'drop', 'drop_duplicates', 'droplevel', 'dropna', 'dtype', 'dtypes', 'dup
licated', 'empty', 'eq', 'equals', 'ewm', 'expanding', 'explode', 'factorize', 'ffil
l', 'fillna', 'filter', 'first', 'first_valid_index', 'flags', 'floordiv', 'ge', 'ge
t', 'groupby', 'gt', 'hasnans', 'head', 'hist', 'iat', 'idxmax', 'idxmin', 'iloc',
'index', 'infer_objects', 'interpolate', 'is_monotonic', 'is_monotonic_decreasing',
'is_monotonic_increasing', 'is_unique', 'isin', 'isna', 'isnull', 'item', 'items',
'iteritems', 'keys', 'kurt', 'kurtosis', 'last', 'last_valid_index', 'le', 'loc', 'l
t', 'mad', 'map', 'mask', 'max', 'mean', 'median', 'memory_usage', 'min', 'mod', 'mo
de', 'mul', 'multiply', 'name', 'nbytes', 'ndim', 'ne', 'nlargest', 'notna', 'notnul
l', 'nsmallest', 'nunique', 'pad', 'pct_change', 'pipe', 'plot', 'pop', 'pow', 'pro
d', 'product', 'quantile', 'radd', 'rank', 'ravel', 'rdiv', 'rdivmod', 'reindex', 'r
eindex_like', 'rename', 'rename_axis', 'reorder_levels', 'repeat', 'replace', 'resam
ple', 'reset_index', 'rfloordiv', 'rmod', 'rmul', 'rolling', 'round', 'rpow', 'rsu
b', 'rtruediv', 'sample', 'searchsorted', 'sem', 'set_axis', 'set_flags', 'shape',
'shift', 'size', 'skew', 'slice_shift', 'sort_index', 'sort_values', 'squeeze', 'st
d', 'str', 'sub', 'subtract', 'sum', 'swapaxes', 'swaplevel', 'tail', 'take', 'to_cl
ipboard', 'to_csv', 'to_dict', 'to_excel', 'to_frame', 'to_hdf', 'to_json', 'to_late
x', 'to_list', 'to_markdown', 'to_numpy', 'to_period', 'to_pickle', 'to_sql', 'to_st
ring', 'to_timestamp', 'to_xarray', 'transform', 'transpose', 'truediv', 'truncate',
'tz_convert', 'tz_localize', 'unique', 'unstack', 'update', 'value_counts', 'value
s', 'var', 'view', 'where', 'xs']
In [14]: s
Out[14]: 0 Viraj
1 Sushen
2 Tasmeen
3 Abhishek
4 Pallavi
5 Onkar
6 Sachin
7 Sanjay
dtype: object
In [17]: s2 = pd.Series(['Suraj','Sarika','Shavez','Sheela','Jyoti','Sandip'])
s2
Out[17]: 0 Suraj
1 Sarika
2 Shavez
3 Sheela
4 Jyoti
5 Sandip
dtype: object
Out[18]: 0 Viraj
1 Sushen
2 Tasmeen
3 Abhishek
4 Pallavi
5 Onkar
6 Sachin
7 Sanjay
0 Suraj
1 Sarika
2 Shavez
3 Sheela
4 Jyoti
5 Sandip
dtype: object
In [19]: # to make proper indexing
s.append(s2,ignore_index=True)
Out[19]: 0 Viraj
1 Sushen
2 Tasmeen
3 Abhishek
4 Pallavi
5 Onkar
6 Sachin
7 Sanjay
8 Suraj
9 Sarika
10 Shavez
11 Sheela
12 Jyoti
13 Sandip
dtype: object
Out[21]: 0 5
1 6
2 7
3 8
4 7
5 5
6 6
7 6
dtype: int64
Out[23]: 0 VIRAJ
1 SUSHEN
2 TASMEEN
3 ABHISHEK
4 PALLAVI
5 ONKAR
6 SACHIN
7 SANJAY
dtype: object
Out[27]: 0 Mr.Viraj
1 Mr.Sushen
2 Mr.Tasmeen
3 Mr.Abhishek
4 Mr.Pallavi
5 Mr.Onkar
6 Mr.Sachin
7 Mr.Sanjay
dtype: object
In [28]: s
Out[28]: 0 Viraj
1 Sushen
2 Tasmeen
3 Abhishek
4 Pallavi
5 Onkar
6 Sachin
7 Sanjay
dtype: object
Out[31]: 0 False
1 True
2 True
3 False
4 False
5 False
6 True
7 False
dtype: bool
In [32]: s[s.str.endswith('n')]
Out[32]: 1 Sushen
2 Tasmeen
6 Sachin
dtype: object
Out[33]: 0 Viraj
1 Sushen
2 Tasmeen
3 Abhishek
4 Pallavi
5 Onkar
6 Sachin
7 Sanjay
dtype: object
In [35]: s.apply(len)>6
Out[35]: 0 False
1 False
2 True
3 True
4 True
5 False
6 False
7 False
dtype: bool
In [36]: s[s.apply(len)>6]
Out[36]: 2 Tasmeen
3 Abhishek
4 Pallavi
dtype: object
In [38]: s.sort_values()
Out[38]: 3 Abhishek
5 Onkar
4 Pallavi
6 Sachin
7 Sanjay
1 Sushen
2 Tasmeen
0 Viraj
dtype: object
In [39]: s.sort_values(ascending=False)
Out[39]: 0 Viraj
2 Tasmeen
1 Sushen
7 Sanjay
6 Sachin
4 Pallavi
5 Onkar
3 Abhishek
dtype: object
Out[45]: 'Virj'
In [47]: s.apply(map({'a':'','e':'','i':'','o':'','u':''}))
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_4884/3589023292.py in <module>
----> 1 s.apply(map({'a':'','e':'','i':'','o':'','u':''}))
In [49]: s.map({'Viraj':1})
Out[49]: 0 1.0
1 NaN
2 NaN
3 NaN
4 NaN
5 NaN
6 NaN
7 NaN
dtype: float64
Viraj
Sushen
Tasmeen
Abhishek
Pallavi
Onkar
Sachin
Sanjay
Out[58]: 0 vrj
1 sshn
2 tsmn
3 bhshk
4 pllv
5 nkr
6 schn
7 snjy
dtype: object
In [59]: s
Out[59]: 0 Viraj
1 Sushen
2 Tasmeen
3 Abhishek
4 Pallavi
5 Onkar
6 Sachin
7 Sanjay
dtype: object
Out[61]: 0 jariV
1 nehsuS
2 neemsaT
3 kehsihbA
4 ivallaP
5 raknO
6 nihcaS
7 yajnaS
dtype: object
jariV
nehsuS
neemsaT
kehsihbA
ivallaP
raknO
nihcaS
yajnaS
In [68]: s.apply(str.re)
Out[68]: 0 Viraj
1 Sushen
2 Tasmeen
3 Abhishek
4 Pallavi
5 Onkar
6 Sachin
7 Sanjay
dtype: object
Out[70]: 0 Viraj
2 Tasmeen
4 Pallavi
6 Sachin
dtype: object
In [76]: s[::2]
Out[76]: 0 Viraj
1 Sushen
2 Tasmeen
3 Abhishek
4 Pallavi
5 Onkar
6 Sachin
7 Sanjay
dtype: object
In [77]: g = pd.Series(['123-ABC','456-PQR'])
g
Out[77]: 0 123-ABC
1 456-PQR
dtype: object
Out[78]: 0 jariV
1 nehsuS
2 neemsaT
3 kehsihbA
4 ivallaP
5 raknO
6 nihcaS
7 yajnaS
dtype: object
In [82]: s.map({'Viraj':4764})
Out[82]: 0 4764.0
1 NaN
2 NaN
3 NaN
4 NaN
5 NaN
6 NaN
7 NaN
dtype: float64
In [ ]:
Series
In [1]: import pandas as pd
In [ ]: #Series(data,index,dtype)
In [6]: d = [10,20,30,40]
ind = ['A','B','C','D']
pd.Series(data=d,index=ind)
Out[6]: A 10
B 20
C 30
D 40
dtype: int64
Out[4]: 10 A
20 B
30 C
40 D
dtype: object
Out[7]: 0 10
1 20
2 30
3 40
dtype: int64
Out[8]: 0 10.0
1 20.0
2 30.0
3 40.0
dtype: float64
In [9]: pd.Series(d,dtype='float32')
Out[9]: 0 10.0
1 20.0
2 30.0
3 40.0
dtype: float32
In [10]: pd.Series(d,dtype='f')
Out[10]: 0 10.0
1 20.0
2 30.0
3 40.0
dtype: float32
In [13]: # float64
pd.Series(d,dtype='f8')
Out[13]: 0 10.0
1 20.0
2 30.0
3 40.0
dtype: float64
Out[19]: 0 10.0
1 20.0
2 30.0
3 40.0
dtype: float32
Out[21]: 0 10
1 20
2 30
3 40
dtype: object
Out[22]: 0 10
1 20
2 30
3 40
dtype: int64
Out[23]: 0 10
1 20
2 30
3 40
dtype: object
In [24]: sf.astype('object')
Out[24]: 0 10
1 20
2 30
3 40
dtype: object
In [25]: sf #original sf is unchanged
Out[25]: 0 10
1 20
2 30
3 40
dtype: int64
In [27]: print(dir(sf))
Out[28]: 0 10
1 20
2 30
3 40
dtype: int64
In [29]: sf.shape
Out[29]: (4,)
In [30]: sf.ndim
Out[30]: 1
DataFrame
# 2D data structure
# composed of rows and columns
# We can create multiple rows and multiple columns with different data types
# powerful option for data science and analysis
# it contains many options for selection, filtering, merging, deletion, etc.
# Dataframe is a combination of multiple series
Empty DataFrame
Columns: []
Index: []
# Structure of Dataframe
pd.DataFrame()
"""
(data=None,
index: 'Axes | None' = None,
columns: 'Axes | None' = None,
dtype: 'Dtype | None' = None,
copy: 'bool | None' = None,
)
"""
Creation of df
# Using: list, tuple,set, dict, numpy,series...
In [33]: pd.DataFrame([10,20,30,40])
Out[33]: 0
0 10
1 20
2 30
3 40
In [35]: pd.DataFrame([10,20,30,40]).shape
Out[35]: (4, 1)
In [36]: pd.DataFrame([10,20,30,40]).ndim
Out[36]: 2
In [34]: pd.Series([10,20,30,40])
Out[34]: 0 10
1 20
2 30
3 40
dtype: int64
In [37]: pd.Series([10,20,30,40]).shape
Out[37]: (4,)
In [38]: pd.Series([10,20,30,40]).ndim
Out[38]: 1
In [39]: # list of list
k = [[1,2],[3,4]]
pd.DataFrame(k)
# each internal list becomes a row
Out[39]: 0 1
0 1 2
1 3 4
101 1 2
102 3 4
Data_1 1 2
Data_2 3 4
Out[43]: 0
0 7
1 8
2 9
In [44]: 7,8,9
Out[44]: (7, 8, 9)
In [49]: # it accepts homogeneous and heterogeneous values
#pd.DataFrame([10,20])
#pd.DataFrame([10,20,30,40.])
pd.DataFrame([10,'20','A','45',67,'B'])
Out[49]: 0
0 10
1 20
2 A
3 45
4 67
5 B
Out[50]: 0 1 2
0 1 2 3
1 1 2 5
In [51]: s = {2,19,10,1,0,99,4}
pd.DataFrame(s)
Out[51]: 0
0 0
1 1
2 2
3 99
4 4
5 19
6 10
0 Ashok 23 pune
1 Seema 24 sangli
In [53]: d = {'name':['Ashok','Seema'],'age':[23,24,45],'place':['pune','sangli']}
pd.DataFrame(d)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_7584/3868696099.py in <module>
1 d = {'name':['Ashok','Seema'],'age':[23,24,45],'place':['pune','sangli']}
----> 2 pd.DataFrame(d)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self,
data, index, columns, dtype, copy)
612 elif isinstance(data, dict):
613 # GH#38939 de facto copy defaults to False only in non-dict ca
ses
--> 614 mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=cop
y, typ=manager)
615 elif isinstance(data, ma.MaskedArray):
616 import numpy.ma.mrecords as mrecords
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\construction.py i
n dict_to_mgr(data, index, columns, dtype, typ, copy)
462 # TODO: can we get rid of the dt64tz special case above?
463
In [62]: dg = pd.DataFrame(np.random.randint(20,45,5),columns=['age'])
dg
Out[62]: age
0 28
1 28
2 31
3 30
4 30
In [65]: # age = 28
dg[dg.age == 28]
Out[65]: age
0 28
1 28
In [66]: dg.query('age==28')
Out[66]: age
0 28
1 28
Out[67]: 0 120
1 300
2 500
3 750
dtype: int64
In [68]: pd.DataFrame(s)
Out[68]: 0
0 120
1 300
2 500
3 750
In [70]: n = np.random.random((5,4))
n
In [71]: pd.DataFrame(n)
Out[71]: 0 1 2 3
Out[76]: a v c d
Out[77]: a v c a
In [80]: g = pd.Series([10,20,30],[10,20,10])
g[10]
Out[80]: 10 10
10 30
dtype: int64
0 A 23 25.0
1 V 45 45.0
2 D 60 67.0
3 A 23 66.0
4 F 18 55.0
5 A 90 89.0
In [84]: y.dtypes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name 6 non-null object
1 Age 6 non-null int64
2 salary 6 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 272.0+ bytes
In [87]: # Descriptive statistics
y.describe() #default works on numeric data
Out[88]: Name
count 6
unique 4
top A
freq 3
In [91]: y.Name.unique()
In [92]: y.Name.nunique()
Out[92]: 4
In [94]: y.Name.value_counts()
# it counts the number of occurrences of each category
Out[94]: A 3
V 1
D 1
F 1
Name: Name, dtype: int64
In [ ]:
In [1]: import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
5247-
Chadwick 1126-03-27 4084-
0 Cashier Chadwick_Gordon5732@acrit.org Gabon NaN
Gordon 18:41:06Z 7638-
0340
3722-
Healthcare 9490-11-24 3713-
1 Gil_Lindop9421@nanoff.biz Gil Lindop Germany NaN
Specialist 08:46:19Z 3784-
4667
2101-
7708-12-09 6418-
2 Audiologist Lillian_Burge7970@fuliss.net Lillian Burge Egypt NaN
09:02:17Z 4041-
2162
0720-
Cedrick 4121-06-08 6612-
3 Clerk Cedrick_Farrant4152@deons.tech France NaN
Farrant 07:54:11Z 6768-
4737
6625-
Healthcare Leslie 0893-05-29 7785-
4 Leslie_Wright2861@cispeto.com Seychelles NaN
Specialist Wright 10:23:50Z 7435-
0444
0532-
Lucy 4529-07-05 1318-
995 Webmaster Lucy_Whatson4266@zorer.org Barbados NaN
Whatson 19:49:54Z 2153-
2268
6810-
Healthcare 1296-08-27 Saudi 6415-
996 Rose_Kirby6758@gmail.com Rose Kirby NaN
Specialist 04:26:53Z Arabia 0575-
0738
4171-
Insurance 0704-08-08 2722-
997 Logan_Silva1028@grannar.com Logan Silva Singapore NaN
Broker 12:18:36Z 8456-
1171
7571-
HR 2871-01-09 Antigua and 3307-
998 Aileen_Wise351@tonsy.org Aileen Wise NaN
Specialist 15:19:54Z Barbuda 6622-
5084
7173-
HR Regina 1688-01-25 Equatorial 6042-
999 Regina_Grey6872@ovock.tech NaN
Coordinator Grey 06:36:18Z Guinea 6326-
1836
In [14]: df.columns
In [16]: df.columns
Out[10]: 0 Gabon
1 Germany
2 Egypt
3 France
4 Seychelles
...
995 Barbados
996 Saudi Arabia
997 Singapore
998 Antigua and Barbuda
999 Equatorial Guinea
Name: Country, Length: 1000, dtype: object
Out[20]: 0 Cashier
1 Healthcare Specialist
2 Audiologist
3 Clerk
4 Healthcare Specialist
...
995 Webmaster
996 Healthcare Specialist
997 Insurance Broker
998 HR Specialist
999 HR Coordinator
Name: JobTitle, Length: 1000, dtype: object
In [22]: # if we want to access multiple columns
df[['JobTitle','Duration']]
Out[27]: 0 Cashier
1 Healthcare Specialist
2 Audiologist
3 Clerk
4 Healthcare Specialist
...
995 Webmaster
996 Healthcare Specialist
997 Insurance Broker
998 HR Specialist
999 HR Coordinator
Name: JobTitle, Length: 1000, dtype: object
Out[26]: 'Cashier'
Out[31]: 0 Cashier
Name: JobTitle, dtype: object
In [28]: # multiple elements using slicing
df.JobTitle[:10]
Out[28]: 0 Cashier
1 Healthcare Specialist
2 Audiologist
3 Clerk
4 Healthcare Specialist
5 Auditor
6 Cashier
7 CNC Operator
8 Staffing Consultant
9 Retail Trainee
Name: JobTitle, dtype: object
Out[30]: JobTitle
0 Cashier
1 Healthcare Specialist
2 Audiologist
3 Clerk
4 Healthcare Specialist
5 Auditor
6 Cashier
7 CNC Operator
8 Staffing Consultant
9 Retail Trainee
In [34]: df.Country[:5][::-1]
Out[34]: 4 Seychelles
3 France
2 Egypt
1 Germany
0 Gabon
Name: Country, dtype: object
In [37]: df.Country[3]
Out[37]: 'France'
C:\Users\hakim\AppData\Local\Temp/ipykernel_8480/2533705342.py:2: SettingWithCopyWar
ning:
A value is trying to be set on a copy of a slice from a DataFrame
Out[39]: 0 Gabon
1 Germany
2 Egypt
3 India
4 Seychelles
...
995 Barbados
996 Saudi Arabia
997 Singapore
998 Antigua and Barbuda
999 Equatorial Guinea
Name: Country, Length: 1000, dtype: object
1126-03-
Chadwick 5247-4084-
0 Cashier Chadwick_Gordon5732@acrit.org 27 Gabon NaN
Gordon 7638-0340
18:41:06Z
9490-11-
Healthcare 3722-3713-
1 Gil_Lindop9421@nanoff.biz Gil Lindop 24 Germany NaN
Specialist 3784-4667
08:46:19Z
7708-12-
Lillian 2101-6418-
2 Audiologist Lillian_Burge7970@fuliss.net 09 Egypt NaN
Burge 4041-2162
09:02:17Z
4121-06-
Cedrick 0720-6612-
3 Clerk Cedrick_Farrant4152@deons.tech 08 India NaN
Farrant 6768-4737
07:54:11Z
0893-05-
Healthcare Leslie 6625-7785-
4 Leslie_Wright2861@cispeto.com 29 Seychelles NaN
Specialist Wright 7435-0444
10:23:50Z
4529-07-
Lucy 0532-1318-
995 Webmaster Lucy_Whatson4266@zorer.org 05 Barbados NaN
Whatson 2153-2268
19:49:54Z
1296-08-
Healthcare Rose Saudi 6810-6415-
996 Rose_Kirby6758@gmail.com 27 NaN
Specialist Kirby Arabia 0575-0738
04:26:53Z
0704-08-
Insurance Logan 4171-2722-
997 Logan_Silva1028@grannar.com 08 Singapore NaN
Broker Silva 8456-1171
12:18:36Z
2871-01- Antigua
HR Aileen 7571-3307-
998 Aileen_Wise351@tonsy.org 09 and NaN
Specialist Wise 6622-5084
15:19:54Z Barbuda
1688-01-
HR Regina Equatorial 7173-6042-
999 Regina_Grey6872@ovock.tech 25 NaN
Coordinator Grey Guinea 6326-1836
06:36:18Z
1126-03-
Chadwick 5247-4084-
0 Cashier Chadwick_Gordon5732@acrit.org 27 Gabon NaN
Gordon 7638-0340
18:41:06Z
9490-11-
Healthcare 3722-3713-
1 Gil_Lindop9421@nanoff.biz Gil Lindop 24 Germany NaN
Specialist 3784-4667
08:46:19Z
7708-12-
Lillian 2101-6418-
2 Audiologist Lillian_Burge7970@fuliss.net 09 Egypt NaN
Burge 4041-2162
09:02:17Z
4121-06-
Cedrick 0720-6612-
3 Clerk Cedrick_Farrant4152@deons.tech 08 India NaN
Farrant 6768-4737
07:54:11Z
0893-05-
Healthcare Leslie 6625-7785-
4 Leslie_Wright2861@cispeto.com 29 Seychelles NaN
Specialist Wright 7435-0444
10:23:50Z
4330-03-
Johnathan 7668-5840-
5 Auditor Johnathan_Kelly6958@gompie.com 09 Malawi NaN
Kelly 4116-7784
14:17:00Z
6350-05-
Oliver 2527-6370-
6 Cashier Oliver_May393@grannar.com 22 Tajikistan NaN
May 6481-8305
11:10:26Z
7476-02-
CNC Mandy 1161-3378-
7 Mandy_Jefferson5919@infotech44.tech 09 Cyprus NaN
Operator Jefferson 8384-8763
00:24:15Z
7556-01-
Staffing Manuel 7234-7585-
8 Manuel_Aldridge5304@extex.org 02 Chad NaN
Consultant Aldridge 6765-4063
04:41:34Z
6818-06-
Retail Nicole Burkina 3447-7506-
9 Nicole_Vane2814@bretoux.com 03 NaN
Trainee Vane Faso 8463-8164
21:24:49Z
1843-06-
Healthcare Chester 5400-7711-
10 Chester_Wills2904@gmail.com 19 Denmark NaN
Specialist Wills 3768-0034
11:38:49Z
14 Banker Jacob_Emmett4946@twace.org
7641-01-
Rick 4528-2683-
560 Loan Officer Rick_Coates6953@bretoux.com 15 Congo NaN
Coates 1214-1305
19:23:28Z
3611-08-
Staffing Angel 5515-8700-
23 Angel_Mackenzie4135@mafthy.com 17 Spain NaN
Consultant Mackenzie 4838-7870
08:24:23Z
9194-03-
Systems Elena 8012-2427-
574 Elena_Saunders7388@bungar.biz 27 South Africa NaN
Administrator Saunders 8270-0606
14:10:32Z
4374-11-
Ramon 8575-6072-
706 Designer Ramon_Lane4884@iatim.tech 25 Honduras NaN
Lane 3601-6566
22:41:07Z
8621-02-
Staffing Ivy 0025-5546-
38 Ivy_Latham170@jiman.org 02 Latvia NaN
Consultant Latham 4858-4740
04:12:47Z
7831-02-
Grace Bosnia and 1363-8640-
695 Loan Officer Grace_Robertson5161@atink.com 18 NaN
Robertson Herzegovina 8405-3471
21:32:44Z
0855-01-
Skylar 3564-4045-
472 Bookkeeper Skylar_Murray4851@ovock.tech 17 Dominica NaN
Murray 5888-1845
15:20:02Z
2990-12-
Associate Carmella 3200-8678-
495 Carmella_Morris2386@atink.com 14 Germany NaN
Professor Morris 5731-0386
05:22:02Z
0893-05-
Healthcare Leslie 6625-7785-
4 Leslie_Wright2861@cispeto.com 29 Seychelles NaN
Specialist Wright 7435-0444
10:23:50Z
6794-05-
Health Josh 6275-7433-
170 Josh_Collins1896@infotech44.tech 31 Micronesia NaN
Educator Collins 7282-6214
19:02:02Z
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 JobTitle 1000 non-null object
1 EmailAddress 1000 non-null object
2 f_l_name 1000 non-null object
3 Duration 1000 non-null object
4 Country 1000 non-null object
5 Empty 0 non-null float64
6 Credit_card 1000 non-null object
dtypes: float64(1), object(6)
memory usage: 54.8+ KB
1126-03-
Chadwick 5247-4084-
0 Cashier Chadwick_Gordon5732@acrit.org 27 Gabon NaN
Gordon 7638-0340
18:41:06Z
9490-11-
Healthcare 3722-3713-
1 Gil_Lindop9421@nanoff.biz Gil Lindop 24 Germany NaN
Specialist 3784-4667
08:46:19Z
7708-12-
Lillian 2101-6418-
2 Audiologist Lillian_Burge7970@fuliss.net 09 Egypt NaN
Burge 4041-2162
09:02:17Z
4121-06-
Cedrick 0720-6612-
3 Clerk Cedrick_Farrant4152@deons.tech 08 India NaN
Farrant 6768-4737
07:54:11Z
0893-05-
Healthcare Leslie 6625-7785-
4 Leslie_Wright2861@cispeto.com 29 Seychelles NaN
Specialist Wright 7435-0444
10:23:50Z
4529-07-
Lucy 0532-1318-
995 Webmaster Lucy_Whatson4266@zorer.org 05 Barbados NaN
Whatson 2153-2268
19:49:54Z
1296-08-
Healthcare Rose Saudi 6810-6415-
996 Rose_Kirby6758@gmail.com 27 NaN
Specialist Kirby Arabia 0575-0738
04:26:53Z
0704-08-
Insurance Logan 4171-2722-
997 Logan_Silva1028@grannar.com 08 Singapore NaN
Broker Silva 8456-1171
12:18:36Z
2871-01- Antigua
HR Aileen 7571-3307-
998 Aileen_Wise351@tonsy.org 09 and NaN
Specialist Wise 6622-5084
15:19:54Z Barbuda
1688-01-
HR Regina Equatorial 7173-6042-
999 Regina_Grey6872@ovock.tech 25 NaN
Coordinator Grey Guinea 6326-1836
06:36:18Z
0 Chadwick_Gordon5732@acrit.org 5247-4084-7638-0340
1 Gil_Lindop9421@nanoff.biz 3722-3713-3784-4667
2 Lillian_Burge7970@fuliss.net 2101-6418-4041-2162
3 Cedrick_Farrant4152@deons.tech 0720-6612-6768-4737
4 Leslie_Wright2861@cispeto.com 6625-7785-7435-0444
1126-03-
5247-4084- Chadwick
0 NaN Gabon 27 Chadwick_Gordon5732@acrit.org Cashier
7638-0340 Gordon
18:41:06Z
9490-11-
3722-3713- Healthcare
1 NaN Germany 24 Gil Lindop Gil_Lindop9421@nanoff.biz
3784-4667 Specialist
08:46:19Z
7708-12-
2101-6418- Lillian
2 NaN Egypt 09 Lillian_Burge7970@fuliss.net Audiologist
4041-2162 Burge
09:02:17Z
4121-06-
0720-6612- Cedrick
3 NaN India 08 Cedrick_Farrant4152@deons.tech Clerk
6768-4737 Farrant
07:54:11Z
0893-05-
6625-7785- Leslie Healthcare
4 NaN Seychelles 29 Leslie_Wright2861@cispeto.com
7435-0444 Wright Specialist
10:23:50Z
4529-07-
0532-1318- Lucy
995 NaN Barbados 05 Lucy_Whatson4266@zorer.org Webmaster
2153-2268 Whatson
19:49:54Z
1296-08-
6810-6415- Saudi Rose Healthcare
996 NaN 27 Rose_Kirby6758@gmail.com
0575-0738 Arabia Kirby Specialist
04:26:53Z
0704-08-
4171-2722- Logan Insurance
997 NaN Singapore 08 Logan_Silva1028@grannar.com
8456-1171 Silva Broker
12:18:36Z
Antigua 2871-01-
7571-3307- Aileen HR
998 NaN and 09 Aileen_Wise351@tonsy.org
6622-5084 Wise Specialist
Barbuda 15:19:54Z
1688-01-
7173-6042- Equatorial Regina HR
999 NaN 25 Regina_Grey6872@ovock.tech
6326-1836 Guinea Grey Coordinator
06:36:18Z