Download as txt, pdf, or txt
Download as txt, pdf, or txt
You are on page 1of 10

df1= pd.Series([2,3,4,5,6])
type(df1)
df1.tolist()
df2= pd.Series([3,4,5,6,7])
df2+df1
df1-df2
df1*df2
df1/df2
To compare elements:
df1==df2
df1>df2
df1<df2
---------------------------------
to convert numpy array into pandas series:
array= np.array([1,2,3,4,5])
pf= pd.Series(array)

---------------------------------
to change the datatype of a column:
s1= pd.Series([1,2,3,4,'python'])
s2= pd.to_numeric(s1,errors='coerce')

----------------------------------
to change one column of a dataframe into a series:
df1= {"col1":[1,2,3,4,5],"col2":[2,34,56,7,8],"col3":[23,4,5,6,7]}
df1= pd.DataFrame(data=df1)
s1= df1.iloc[:,0]

----------------------------------
to convert a series into a NumPy array:
s1= pd.Series([1,2,3,4,'python'])
nd= np.array(s1.values.tolist())

----------------------------------
to convert a series of lists into one series:
s1= pd.Series([[1,2,3,4],[5,6,7,8],[2,3,4]])
s1= s1.apply(pd.Series).stack().reset_index(drop=True)

----------------------------------
to sort the values of a pandas series:
s1= pd.Series([1,2,3,4])
new_s1= pd.Series(s1).sort_values()

----------------------------------
to add elements into an existing pandas series:
s1= pd.Series([1,2,3,4])
new_s1= pd.concat([s1, pd.Series([45,'python'])]).reset_index(drop=True)

----------------------------------
to create a subset of given series based on value and condition:
s1= pd.Series([0,1,2,3,4,5,6,7])
n=5
s_new= s1[s1>n]

----------------------------------
to change the order of index:
s1= pd.Series([0,1,2,3,4,5],index=['A','B','C','D','E','F'])
s1= s1.reindex(index=['B','A','D','E','F','C'])
----------------------------------
to calculate mean and standard deviation of Series:
s1= pd.Series([2,3,4,5,6])
mean= s1.mean()
st_dev= s1.std()

----------------------------------
to get the items of series which are not in another series
s1= pd.Series([1,2,3,4,5,6])
s2= pd.Series([2,4,6,8,10,12])
print("Element of s1 which are not in s2")
new_s1= s1[~s1.isin(s2)]

----------------------------------
To get the items which are not common in both the series:
s1= pd.Series([1,2,3,4,5,6])
s2=pd.Series([2,4,6,8,10,12])
print('Elements which are not in common')
s11= pd.Series(np.union1d(s1,s2))
s22= pd.Series(np.intersect1d(s1,s2))
uncommon_elements= s11[~s11.isin(s22)]

----------------------------------
To compute minimum 25th 50th 75th and maximum values of a series:
s1= np.random.RandomState(100)
num_series= pd.Series(s1.normal(10,4,20))  # mean=10, std_dev=4, total values=20
result= np.percentile(num_series,q=[0,25,50,75,100])
print(result)

----------------------------------
To get the frequency count of each unique value:
s1= pd.Series([2,3,2,4,5,6,4,2,2,3,4,3,2])
result= s1.value_counts()

----------------------------------
To keep the most frequent value and replace all other elements with 'Other':
s1= pd.Series(np.random.randint(1,5,[15]))
s1= s1.where(s1.isin(s1.value_counts().index[:1]), 'Other')

----------------------------------
Print the position of number from series which are multiple of 5:
s1= pd.Series([1,2,3,5,10,15,30])
index= np.where(s1%5==0)
print(index)

----------------------------------
to extract the item at given position:
s1= pd.Series(list('23456789087633235'))
pos= [0,1,2,3,5,6]

extracted_item= s1.take(pos)

----------------------------------
to get the position of element from a given series to another series:
s1= pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
s2= pd.Series([1, 3, 5, 7, 10])
result= [pd.Index(s1).get_loc(i) for i in s2]

----------------------------------
to make first and last letter of every word in series in upper case:
series1= pd.Series(['php','python'])
new_series= series1.map(lambda x: x[0].upper() + x[1:-1] + x[-1].upper())

----------------------------------
To calculate len of characters in each word:
series1= pd.Series(['php','python'])
new_= series1.map(lambda x: len(x))
print(new_)

----------------------------------
Difference between two consecutive number:
s1= pd.Series([1,2,3,4,5,6,7])
result= s1.diff().tolist()

-----------------------------------
to convert dates into timeseries:
df= pd.Series(['01 Jan 2015', '10-02-2016', '20180307', '2014/05/06', '2016-04-12',
'2019-04-06T11:20'])
new_df= pd.to_datetime(df)

-----------------------------------
To get the date,month,year,week number, day of week from date:
from dateutil.parser import parse

date_series= pd.Series(['01 Jan 2015', '10-02-2016', '20180307', '2014/05/06',
'2016-04-12', '2019-04-06T11:20'])
date_series= date_series.map(lambda x: parse(x))
print("Day of month:")
print(date_series.dt.day.tolist())
print("Day of year:")
print(date_series.dt.dayofyear.tolist())
print("Week number:")
print(date_series.dt.isocalendar().week.tolist())
print("Day of week:")
print(date_series.dt.weekday.tolist())

------------------------------------
To flag the words which have two or more vowels (call .sum() on the result to count them):
from collections import Counter
series= pd.Series(['Red', 'Green', 'Orange', 'Pink', 'Yellow', 'White'])
series= series.map(lambda x: sum([Counter(x.lower()).get(i,0) for i in
list('aeiou')])>=2)

------------------------------------
to get the Euclidean distance:
x= pd.Series([])
y= pd.Series([])
distance= np.linalg.norm(x-y)

------------------------------------
Replace white space with least frequent character in the string
s= 'abc def abcdef icd'
s_series= pd.Series(list(s))
element_counts= s_series.value_counts()
current_freq= element_counts.dropna().index[-1]
result= "".join(s.replace(" ",current_freq))

-------------------------------------
Autocorrelation, also known as serial correlation, is the correlation of a signal
with a delayed copy of itself as a function of delay.
Informally, it is the similarity between observations as a function of the time lag
between them.
num_series = pd.Series(np.arange(15) + np.random.normal(1, 10, 15))
autocorrelations = [num_series.autocorr(i).round(2) for i in range(11)]

-------------------------------------
Create a time series to display Sunday of entire year:
result = pd.Series(pd.date_range('2022-01-01',periods=52,freq='W-SUN'))

--------------------------------------
To convert a series into Dataframe and taking index as another column
char_list = list('ABCDEFGHIJKLMNOP')
num_arra = np.arange(8)
num_dict = dict(zip(char_list, num_arra))
num_ser = pd.Series(num_dict)
df= num_ser.to_frame().reset_index()

---------------------------------------
To add two series vertically and horizontally:
Vertically:
series_vertical= pd.concat([series1,series2])
Horizontally:
series_horizontal= pd.concat([series1,series2],axis=1)

---------------------------------------
To get the index labels of the min and max values of a Series:
num_series= pd.Series([1,2,3,4,5,5,6,7])
min_num= num_series.idxmin()
max_num= num_series.idxmax()

---------------------------------------
to get basic information about dataframe:
df= pd.DataFrame([])
df.info()

---------------------------------------
To get the first three rows:
df.iloc[:3]

---------------------------------------
To select two specified columns:
df[['name','score']]

---------------------------------------
To select specified rows and column from a dataframe:
df.iloc[[1,2,3,4],[1,3]]

---------------------------------------
To get rows where attempts is greater than two:
df[df['attempts']>2]

---------------------------------------
To get the total number of rows and columns:
total_rows= len(df.axes[0])
total_cols= len(df.axes[1])

----------------------------------------
To get the rows where the data is missing:
df[df['score'].isnull()]
----------------------------------------
select the rows where attempts is less than 2 and score greater than 15:
df[(df['attempts']<2) & (df['score']>15)]

----------------------------------------
To select rows where the score is between 15 and 20 inclusive
df[df['score'].between(15,20)]

----------------------------------------
To change the score in row 'd' to 15:
df.loc['d','score']=15

----------------------------------------
Sum of examination attempt by students:
df['attempts'].sum()

----------------------------------------
to calculate the mean score for each different student:
df.groupby('name')['score'].mean()

----------------------------------------
To add a new row at k with given values:
df.loc['k']=[1,'Suresh','yes',15.5]

----------------------------------------
To drop a row by its index label:
df.drop('k')

----------------------------------------
Sort the column name by descending order and score by ascending order:
df.sort_values(by=['name','score'],ascending=[False,True])

----------------------------------------
Replace 'yes' with True and 'no' with False in the qualify column:
df['qualify']= df['qualify'].map({'yes': True,'no': False})

----------------------------------------
To change the name in names column from James to Suresh
df['names']= df['names'].replace('James','Suresh')

----------------------------------------
To delete the column attempts
df.pop('attempts')

----------------------------------------
To insert a new column in existing dataframe
color=['Blue','Green','White','Red']
df['color']=color

----------------------------------------
To iterate over rows in dataframe:
for index,row in df.iterrows():
    print(index, row)

----------------------------------------
To get the column names of dataframe
df.columns.values

----------------------------------------
To rename the column:
df= df.rename(columns={'col1':'column1','col2':'column2','col3':'column3'})

----------------------------------------
To select rows based on some value from columns:
df.loc[df['col']==4]

----------------------------------------
To change the order of the columns:
df= df[['col3','col1','col2']]

----------------------------------------
To add row in existing dataframe:
df2= {'col1':2,'col2':'Suresh','col3':15.5}
df1= pd.concat([df1, pd.DataFrame([df2])],ignore_index=True)

----------------------------------------
To save the file as CSV with \t as separator
df1.to_csv('data.csv',sep='\t',index=False)

----------------------------------------
Count of people city wise:
count= df.groupby(['city']).size().reset_index(name='No of people')

----------------------------------------
To delete rows with a given value or condition
df= df[df.cols!=5]

----------------------------------------
To select the rows by integer index:
df1= df1.iloc[[2]]

----------------------------------------
To replace all the NaN value with 0
df= df.fillna(0)

----------------------------------------
To convert index in a column in a data frame
df.reset_index(level=0,inplace=True)

To hide the index column


df.to_string(index=False)

----------------------------------------
Set a particular value using index value

df.at[8, 'score']= 12.0

General form (DataFrame.set_value was removed in pandas 1.0; use .at instead):
df.at[index_row, column_name]= value

----------------------------------------
To count number of null values in one or more column
df.isnull().values.sum()

-----------------------------------------
To drop a list of rows using index from dataframe:
df= df.drop(df.index[[1,2,3,4]])

-----------------------------------------
To drop rows by position (drop first two rows)
df= df.drop(df.index[:2])
-----------------------------------------
To reset index in a dataframe
df= df.reset_index()

-----------------------------------------
to divide the data frame
df= pd.DataFrame([])
part70= df.sample(frac=0.7,random_state=10)
part30= df.drop(part70.index)

------------------------------------------
To concatenate two series into a Dataframe

s1= pd.Series([])
s2= pd.Series([])

df= pd.concat([s1,s2],axis=1)

-------------------------------------------
convert from string date time to data frame column
s= pd.Series([])
r= pd.to_datetime(s)
df= pd.DataFrame(r)

-------------------------------------------
To get list of specified column

col2= df['col2'].tolist()

-------------------------------------------
Find the row number where the value of the column is maximum

mx_index_col1= df['col1'].argmax()

-------------------------------------------
To check if the column is present in dataframe
if 'col1' in df.columns:
    print("col1 is present")

-------------------------------------------
To get the row value at specified index
df.iloc[3] -> return the values present at index 3

-------------------------------------------
To get the specified data types of the column:
df.dtypes

-------------------------------------------
To add the data into empty data frame:

data= pd.DataFrame({'col1': value1,


'col2': value2,
'col3': value3})

df= pd.DataFrame()
df= pd.concat([df, data])

-------------------------------------------
Sort the data frame by two or more columns
df.sort_values(by=['score','name'],ascending=[False,True])

-------------------------------------------
To convert the data type float to int

df.score.astype(int)

-------------------------------------------
To replace infinity to NaN

df= df.replace([np.inf, -np.inf],np.nan)

-------------------------------------------
To add new column at specified index

index=0
col1= [1,2,3,4,5]
df.insert(loc=index,column='col1',value=col1)

-------------------------------------------
To convert the list of list into a dataframe

my_list= [['col1','col2'],[1,2],[3,4]]
headers= my_list.pop(0)
df= pd.DataFrame(my_list,columns= headers)

-------------------------------------------
To group the dataframe by column1 and get the column 2 values as list

df=df.groupby('col1')['col2'].apply(list)

--------------------------------------------
get the index of column
df.columns.get_loc('col1')

--------------------------------------------
To count the number of columns
len(df.columns)

--------------------------------------------
To select all columns except one column
df= df.loc[:, df.columns!='col3']

--------------------------------------------
To get first n records:
df.head(n)

--------------------------------------------
To get last n records:
df.tail(n)

--------------------------------------------
to get topmost number from column

df.nlargest(3,'col1')

--------------------------------------------
to get rows after removing first n rows
df1= df.iloc[3:]

--------------------------------------------
To get rows after removing last n rows (here n=3):
df1= df.iloc[:-3]

--------------------------------------------
To add prefix and suffix in column name

df.add_prefix("A_")
df.add_suffix("_A")

-------------------------------------------
To select columns by datatype

df= df.select_dtypes(include='object')

-------------------------------------------
To divide values in different subset

df1= df.sample(frac=0.6)
df2= df.drop(df1.index)

-------------------------------------------
To convert a continuous value column into a categorical column
df['Age_group']= pd.cut(df['age'],bins=[0,18,25,35],labels=['kids','adults','old'])

-------------------------------------------
To use local variable in query
maxx= df['col'].max()

df= df.query("col< @maxx")

-------------------------------------------
to get index and distinct value of column

labels, names= pd.factorize(df['name'])

-------------------------------------------
To read data copied from an Excel sheet (via the clipboard; use pd.read_excel to read a file directly)

df= pd.read_clipboard()
-------------------------------------------

df1= pd.DataFrame()
df2= pd.DataFrame()

df1.ne(df2) --> check for inequality -> if unequal return true

-------------------------------------------
To set the index :

df= pd.DataFrame()

df.set_index('col_name')

-------------------------------------------
To remove the index and make the index as default
df1= df.reset_index(inplace=False)

You might also like