Python for Data Science: Merged Cheat Sheets

Keras Cheat Sheet
Python For Data Science
Learn Keras online at www.DataCamp.com

Keras is a powerful and easy-to-use deep learning library for Theano and TensorFlow that provides a high-level neural networks API to develop and evaluate deep learning models.

> A Basic Example

>>> import numpy as np
>>> from tensorflow.keras.models import Sequential
>>> from tensorflow.keras.layers import Dense
>>> data = np.random.random((1000,100))
>>> labels = np.random.randint(2,size=(1000,1))
>>> model = Sequential()
>>> model.add(Dense(32,
                    activation='relu',
                    input_dim=100))
>>> model.add(Dense(1, activation='sigmoid'))
>>> model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
>>> model.fit(data,labels,epochs=10,batch_size=32)
>>> predictions = model.predict(data)

> Data                                            Also see NumPy & Scikit-Learn

Your data needs to be stored as NumPy arrays or as a list of NumPy arrays. Ideally, you split the data into training and test sets, for which you can also resort to the train_test_split function of sklearn.model_selection.

Keras Data Sets
>>> from tensorflow.keras.datasets import boston_housing, mnist, cifar10, imdb
>>> (x_train,y_train),(x_test,y_test) = mnist.load_data()
>>> (x_train2,y_train2),(x_test2,y_test2) = boston_housing.load_data()
>>> (x_train3,y_train3),(x_test3,y_test3) = cifar10.load_data()
>>> (x_train4,y_train4),(x_test4,y_test4) = imdb.load_data(num_words=20000)
>>> num_classes = 10

Other
>>> from urllib.request import urlopen
>>> data = np.loadtxt(urlopen("http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"), delimiter=",")
>>> X = data[:,0:8]
>>> y = data[:,8]

> Preprocessing                                   Also see NumPy & Scikit-Learn

Sequence Padding
>>> from tensorflow.keras.preprocessing import sequence
>>> x_train4 = sequence.pad_sequences(x_train4,maxlen=80)
>>> x_test4 = sequence.pad_sequences(x_test4,maxlen=80)

One-Hot Encoding
>>> from tensorflow.keras.utils import to_categorical
>>> Y_train = to_categorical(y_train, num_classes)
>>> Y_test = to_categorical(y_test, num_classes)
>>> Y_train3 = to_categorical(y_train3, num_classes)
>>> Y_test3 = to_categorical(y_test3, num_classes)

Train and Test Sets
>>> from sklearn.model_selection import train_test_split
>>> X_train5,X_test5,y_train5,y_test5 = train_test_split(X, y,
                                                         test_size=0.33,
                                                         random_state=42)

Standardization/Normalization
>>> from sklearn.preprocessing import StandardScaler
>>> scaler = StandardScaler().fit(x_train2)
>>> standardized_X = scaler.transform(x_train2)
>>> standardized_X_test = scaler.transform(x_test2)

> Model Architecture

Sequential Model
>>> from tensorflow.keras.models import Sequential
>>> model = Sequential()
>>> model2 = Sequential()
>>> model3 = Sequential()

Multilayer Perceptron (MLP)

Binary Classification
>>> from tensorflow.keras.layers import Dense
>>> model.add(Dense(12,
                    input_dim=8,
                    kernel_initializer='uniform',
                    activation='relu'))
>>> model.add(Dense(8,kernel_initializer='uniform',activation='relu'))
>>> model.add(Dense(1,kernel_initializer='uniform',activation='sigmoid'))

Multi-Class Classification
>>> from tensorflow.keras.layers import Dropout
>>> model.add(Dense(512,activation='relu',input_shape=(784,)))
>>> model.add(Dropout(0.2))
>>> model.add(Dense(512,activation='relu'))
>>> model.add(Dropout(0.2))
>>> model.add(Dense(10,activation='softmax'))

Regression
>>> model.add(Dense(64,activation='relu',input_dim=train_data.shape[1]))
>>> model.add(Dense(1))

Convolutional Neural Network (CNN)
>>> from tensorflow.keras.layers import Activation,Conv2D,MaxPooling2D,Flatten
>>> model2.add(Conv2D(32,(3,3),padding='same',input_shape=x_train.shape[1:]))
>>> model2.add(Activation('relu'))
>>> model2.add(Conv2D(32,(3,3)))
>>> model2.add(Activation('relu'))
>>> model2.add(MaxPooling2D(pool_size=(2,2)))
>>> model2.add(Dropout(0.25))
>>> model2.add(Conv2D(64,(3,3), padding='same'))
>>> model2.add(Activation('relu'))
>>> model2.add(Conv2D(64,(3, 3)))
>>> model2.add(Activation('relu'))
>>> model2.add(MaxPooling2D(pool_size=(2,2)))
>>> model2.add(Dropout(0.25))
>>> model2.add(Flatten())
>>> model2.add(Dense(512))
>>> model2.add(Activation('relu'))
>>> model2.add(Dropout(0.5))
>>> model2.add(Dense(num_classes))
>>> model2.add(Activation('softmax'))

Recurrent Neural Network (RNN)
>>> from tensorflow.keras.layers import Embedding,LSTM
>>> model3.add(Embedding(20000,128))
>>> model3.add(LSTM(128,dropout=0.2,recurrent_dropout=0.2))
>>> model3.add(Dense(1,activation='sigmoid'))

> Inspect Model

>>> model.output_shape #Model output shape
>>> model.summary() #Model summary representation
>>> model.get_config() #Model configuration
>>> model.get_weights() #List all weight tensors in the model

> Compile Model

MLP: Binary Classification
>>> model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

MLP: Multi-Class Classification
>>> model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

MLP: Regression
>>> model.compile(optimizer='rmsprop',
                  loss='mse',
                  metrics=['mae'])

Recurrent Neural Network
>>> model3.compile(loss='binary_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])

> Model Training

>>> model3.fit(x_train4,
               y_train4,
               batch_size=32,
               epochs=15,
               verbose=1,
               validation_data=(x_test4,y_test4))

> Evaluate Your Model's Performance

>>> score = model3.evaluate(x_test,
                            y_test,
                            batch_size=32)

> Prediction

>>> model3.predict(x_test4, batch_size=32)
>>> (model3.predict(x_test4, batch_size=32) > 0.5).astype("int32") #Class labels (predict_classes was removed in TensorFlow 2.6)

> Save/Reload Models

>>> from tensorflow.keras.models import load_model
>>> model3.save('model_file.h5')
>>> my_model = load_model('my_model.h5')

> Model Fine-tuning

Optimization Parameters
>>> from tensorflow.keras.optimizers import RMSprop
>>> opt = RMSprop(learning_rate=0.0001) #The lr and decay arguments are deprecated in recent TensorFlow
>>> model2.compile(loss='categorical_crossentropy',
                   optimizer=opt,
                   metrics=['accuracy'])

Early Stopping
>>> from tensorflow.keras.callbacks import EarlyStopping
>>> early_stopping_monitor = EarlyStopping(patience=2)
>>> model3.fit(x_train4,
               y_train4,
               batch_size=32,
               epochs=15,
               validation_data=(x_test4,y_test4),
               callbacks=[early_stopping_monitor])
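For orientation, here is a hedged end-to-end sketch that strings the pieces above together on random data (so the metrics are meaningless). It assumes only that TensorFlow 2.x is installed; the file name end_to_end.h5 is arbitrary, and the explicit Input layer is used because it works across Keras versions.

>>> import numpy as np
>>> from tensorflow.keras.models import Sequential, load_model
>>> from tensorflow.keras.layers import Input, Dense
>>> data = np.random.random((1000, 100))            #1000 samples, 100 features
>>> labels = np.random.randint(2, size=(1000, 1))   #Binary targets
>>> model = Sequential([Input(shape=(100,)),
...                     Dense(32, activation='relu'),
...                     Dense(1, activation='sigmoid')])
>>> model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
>>> model.fit(data, labels, epochs=2, batch_size=32, verbose=0)          #Train
>>> loss, acc = model.evaluate(data, labels, batch_size=32, verbose=0)   #Evaluate
>>> model.save('end_to_end.h5')                     #Save architecture and weights...
>>> probs = load_model('end_to_end.h5').predict(data[:5])   #...then reload and predict probabilities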
NumPy Cheat Sheet
Python For Data Science
Learn NumPy online at www.DataCamp.com

The NumPy library is the core library for scientific computing in Python. It provides a high-performance multidimensional array object, and tools for working with these arrays.

Use the following import convention:
>>> import numpy as np

> Creating Arrays

>>> a = np.array([1,2,3])
>>> b = np.array([(1.5,2,3), (4,5,6)], dtype = float)
>>> c = np.array([[(1.5,2,3), (4,5,6)],[(3,2,1), (4,5,6)]], dtype = float)

Initial Placeholders
>>> np.zeros((3,4)) #Create an array of zeros
>>> np.ones((2,3,4),dtype=np.int16) #Create an array of ones
>>> d = np.arange(10,25,5) #Create an array of evenly spaced values (step value)
>>> np.linspace(0,2,9) #Create an array of evenly spaced values (number of samples)
>>> e = np.full((2,2),7) #Create a constant array
>>> f = np.eye(2) #Create a 2X2 identity matrix
>>> np.random.random((2,2)) #Create an array with random values
>>> np.empty((3,2)) #Create an empty array

> I/O

Saving & Loading On Disk
>>> np.save('my_array', a)
>>> np.savez('array.npz', a, b)
>>> np.load('my_array.npy')

Saving & Loading Text Files
>>> np.loadtxt("myfile.txt")
>>> np.genfromtxt("my_file.csv", delimiter=',')
>>> np.savetxt("myarray.txt", a, delimiter=" ")

> Data Types

>>> np.int64 #Signed 64-bit integer types
>>> np.float32 #Standard single-precision floating point
>>> np.complex128 #Complex numbers represented by two 64-bit floats
>>> np.bool_ #Boolean type storing TRUE and FALSE values
>>> np.object_ #Python object type
>>> np.string_ #Fixed-length string type
>>> np.unicode_ #Fixed-length unicode type
(The unsuffixed aliases np.bool, np.object and np.complex were removed in NumPy 1.24.)

> Inspecting Your Array

>>> a.shape #Array dimensions
>>> len(a) #Length of array
>>> b.ndim #Number of array dimensions
>>> e.size #Number of array elements
>>> b.dtype #Data type of array elements
>>> b.dtype.name #Name of data type
>>> b.astype(int) #Convert an array to a different type

> Asking For Help

>>> np.info(np.ndarray.dtype)

> Array Mathematics

Arithmetic Operations
>>> g = a - b #Subtraction
array([[-0.5, 0. , 0. ],
       [-3. , -3. , -3. ]])
>>> np.subtract(a,b) #Subtraction
>>> b + a #Addition
array([[ 2.5, 4. , 6. ],
       [ 5. , 7. , 9. ]])
>>> np.add(b,a) #Addition
>>> a / b #Division
array([[ 0.66666667, 1. , 1. ],
       [ 0.25 , 0.4 , 0.5 ]])
>>> np.divide(a,b) #Division
>>> a * b #Multiplication
array([[ 1.5, 4. , 9. ],
       [ 4. , 10. , 18. ]])
>>> np.multiply(a,b) #Multiplication
>>> np.exp(b) #Exponentiation
>>> np.sqrt(b) #Square root
>>> np.sin(a) #Element-wise sine
>>> np.cos(b) #Element-wise cosine
>>> np.log(a) #Element-wise natural logarithm
>>> e.dot(f) #Dot product
array([[ 7., 7.],
       [ 7., 7.]])

Comparison
>>> a == b #Element-wise comparison
array([[False, True, True],
       [False, False, False]], dtype=bool)
>>> a < 2 #Element-wise comparison
array([True, False, False], dtype=bool)
>>> np.array_equal(a, b) #Array-wise comparison

Aggregate Functions
>>> a.sum() #Array-wise sum
>>> a.min() #Array-wise minimum value
>>> b.max(axis=0) #Maximum value of each array column
>>> b.cumsum(axis=1) #Cumulative sum of the elements
>>> a.mean() #Mean
>>> np.median(b) #Median
>>> np.corrcoef(a) #Correlation coefficient
>>> np.std(b) #Standard deviation

> Copying Arrays

>>> h = a.view() #Create a view of the array with the same data
>>> np.copy(a) #Create a copy of the array
>>> h = a.copy() #Create a deep copy of the array

> Sorting Arrays

>>> a.sort() #Sort an array
>>> c.sort(axis=0) #Sort the elements of an array's axis

> Subsetting, Slicing, Indexing

Subsetting
>>> a[2] #Select the element at the 2nd index
3
>>> b[1,2] #Select the element at row 1 column 2 (equivalent to b[1][2])
6.0

Slicing
>>> a[0:2] #Select items at index 0 and 1
array([1, 2])
>>> b[0:2,1] #Select items at rows 0 and 1 in column 1
array([ 2., 5.])
>>> b[:1] #Select all items at row 0 (equivalent to b[0:1, :])
array([[1.5, 2., 3.]])
>>> c[1,...] #Same as c[1,:,:]
array([[[ 3., 2., 1.],
        [ 4., 5., 6.]]])
>>> a[ : :-1] #Reversed array a
array([3, 2, 1])

Boolean Indexing
>>> a[a<2] #Select elements from a less than 2
array([1])

Fancy Indexing
>>> b[[1, 0, 1, 0],[0, 1, 2, 0]] #Select elements (1,0),(0,1),(1,2) and (0,0)
array([ 4. , 2. , 6. , 1.5])
>>> b[[1, 0, 1, 0]][:,[0,1,2,0]] #Select a subset of the matrix's rows and columns
array([[ 4. , 5. , 6. , 4. ],
       [ 1.5, 2. , 3. , 1.5],
       [ 4. , 5. , 6. , 4. ],
       [ 1.5, 2. , 3. , 1.5]])

> Array Manipulation

Transposing Array
>>> i = np.transpose(b) #Permute array dimensions
>>> i.T #Permute array dimensions

Changing Array Shape
>>> b.ravel() #Flatten the array
>>> g.reshape(3,-2) #Reshape, but don't change data

Adding/Removing Elements
>>> h.resize((2,6)) #Resize the array in place to shape (2,6)
>>> np.append(h,g) #Append items to an array
>>> np.insert(a, 1, 5) #Insert items in an array
>>> np.delete(a,[1]) #Delete items from an array

Combining Arrays
>>> np.concatenate((a,d),axis=0) #Concatenate arrays
array([ 1, 2, 3, 10, 15, 20])
>>> np.vstack((a,b)) #Stack arrays vertically (row-wise)
array([[ 1. , 2. , 3. ],
       [ 1.5, 2. , 3. ],
       [ 4. , 5. , 6. ]])
>>> np.r_[e,f] #Stack arrays vertically (row-wise)
>>> np.hstack((e,f)) #Stack arrays horizontally (column-wise)
array([[ 7., 7., 1., 0.],
       [ 7., 7., 0., 1.]])
>>> np.column_stack((a,d)) #Create stacked column-wise arrays
array([[ 1, 10],
       [ 2, 15],
       [ 3, 20]])
>>> np.c_[a,d] #Create stacked column-wise arrays

Splitting Arrays
>>> np.hsplit(a,3) #Split the array horizontally into 3 sub-arrays
[array([1]),array([2]),array([3])]
>>> np.vsplit(c,2) #Split the array vertically into 2 sub-arrays
[array([[[ 1.5, 2. , 3. ],
         [ 4. , 5. , 6. ]]]),
 array([[[ 3., 2., 1.],
         [ 4., 5., 6.]]])]
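The examples on this sheet keep reusing the same small arrays. As a sanity check, here is a self-contained sketch that defines them and reproduces a few of the outputs shown above:

>>> import numpy as np
>>> a = np.array([1, 2, 3])                              #1D
>>> b = np.array([(1.5, 2, 3), (4, 5, 6)], dtype=float)  #2D
>>> d = np.arange(10, 25, 5)                             #array([10, 15, 20])
>>> e = np.full((2, 2), 7)
>>> f = np.eye(2)
>>> print(b + a)              #Broadcasting: a is stretched across b's rows
[[2.5 4.  6. ]
 [5.  7.  9. ]]
>>> print(b[[1, 0], [0, 1]])  #Fancy indexing picks elements (1,0) and (0,1)
[4. 2.]
>>> print(e.dot(f))           #Dot product with the identity returns e as floats
[[7. 7.]
 [7. 7.]]
>>> print(np.column_stack((a, d)))
[[ 1 10]
 [ 2 15]
 [ 3 20]]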
Data Wrangling in Pandas Cheat Sheet
Python For Data Science
Learn Data Wrangling online at www.DataCamp.com

> Reshaping Data

Pivot
>>> df3 = df2.pivot(index='Date', #Spread rows into columns
                    columns='Type',
                    values='Value')

Pivot Table
>>> df4 = pd.pivot_table(df2, #Spread rows into columns
                         values='Value',
                         index='Date',
                         columns='Type')

Stack / Unstack
>>> stacked = df5.stack() #Pivot a level of column labels
>>> stacked.unstack() #Pivot a level of index labels

Melt
>>> pd.melt(df2, #Gather columns into rows
            id_vars=["Date"],
            value_vars=["Type", "Value"],
            value_name="Observations")

> Advanced Indexing                               Also see NumPy Arrays

Selecting
>>> df3.loc[:,(df3>1).any()] #Select cols with any vals >1
>>> df3.loc[:,(df3>1).all()] #Select cols with vals > 1
>>> df3.loc[:,df3.isnull().any()] #Select cols with NaN
>>> df3.loc[:,df3.notnull().all()] #Select cols without NaN

Indexing With isin()
>>> df[(df.Country.isin(df2.Type))] #Find same elements
>>> df3.filter(items=["a","b"]) #Filter on values
>>> df.select(lambda x: not x%5) #Select specific elements (df.select was removed in pandas 1.0; use df.loc with a boolean mask instead)

Where
>>> s.where(s > 0) #Subset the data

Query
>>> df6.query('second > first') #Query DataFrame

Setting/Resetting Index
>>> df.set_index('Country') #Set the index
>>> df4 = df.reset_index() #Reset the index
>>> df = df.rename(index=str, #Rename DataFrame
                   columns={"Country":"cntry",
                            "Capital":"cptl",
                            "Population":"ppltn"})

Reindexing
>>> s2 = s.reindex(['a','c','d','e','b'])

Forward Filling
>>> df.reindex(range(4),
               method='ffill')
   Country  Capital    Population
0  Belgium  Brussels   11190846
1  India    New Delhi  1303171035
2  Brazil   Brasília   207847528
3  Brazil   Brasília   207847528

Backward Filling
>>> s3 = s.reindex(range(5),
                   method='bfill')
0  3
1  3
2  3
3  3
4  3

MultiIndexing
>>> arrays = [np.array([1,2,3]),
              np.array([5,4,3])]
>>> df5 = pd.DataFrame(np.random.rand(3, 2), index=arrays)
>>> tuples = list(zip(*arrays))
>>> index = pd.MultiIndex.from_tuples(tuples,
                                      names=['first', 'second'])
>>> df6 = pd.DataFrame(np.random.rand(3, 2), index=index)
>>> df2.set_index(["Date", "Type"])

> Duplicate Data

>>> s3.unique() #Return unique values
>>> df2.duplicated('Type') #Check duplicates
>>> df2.drop_duplicates('Type', keep='last') #Drop duplicates
>>> df.index.duplicated() #Check index duplicates

> Grouping Data

Aggregation
>>> df2.groupby(by=['Date','Type']).mean()
>>> df4.groupby(level=0).sum()
>>> df4.groupby(level=0).agg({'a':lambda x:sum(x)/len(x), 'b': np.sum})

Transformation
>>> customSum = lambda x: (x+x%2)
>>> df4.groupby(level=0).transform(customSum)

> Missing Data

>>> df.dropna() #Drop NaN values
>>> df3.fillna(df3.mean()) #Fill NaN values with a predetermined value
>>> df2.replace("a", "f") #Replace values with others

> Combining Data

Merge
>>> pd.merge(data1,
             data2,
             how='left',
             on='X1')
>>> pd.merge(data1,
             data2,
             how='right',
             on='X1')
>>> pd.merge(data1,
             data2,
             how='inner',
             on='X1')
>>> pd.merge(data1,
             data2,
             how='outer',
             on='X1')

Join
>>> data1.join(data2, how='right')

Concatenate
Vertical
>>> pd.concat([s, s2]) #(Series.append was removed in pandas 2.0)
Horizontal/Vertical
>>> pd.concat([s,s2],axis=1, keys=['One','Two'])
>>> pd.concat([data1, data2], axis=1, join='inner')

> Dates

>>> from datetime import datetime
>>> df2['Date'] = pd.to_datetime(df2['Date'])
>>> df2['Date'] = pd.date_range('2000-1-1',
                                periods=6,
                                freq='M')
>>> dates = [datetime(2012,5,1), datetime(2012,5,2)]
>>> index = pd.DatetimeIndex(dates)
>>> index = pd.date_range(datetime(2012,2,1), end, freq='BM') #end is a datetime defined elsewhere

> Visualization                                   Also see Matplotlib

>>> import matplotlib.pyplot as plt
>>> s.plot()
>>> plt.show()
>>> df2.plot()
>>> plt.show()

> Iteration

>>> df.items() #(Column name, Series) pairs (iteritems was removed in pandas 2.0)
>>> df.iterrows() #(Row index, Series) pairs
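Since pivot() and melt() are inverses, a small round trip makes the reshaping section concrete. The df2 below is a minimal stand-in consistent with the Date/Type/Value columns used in the examples above:

>>> import pandas as pd
>>> df2 = pd.DataFrame({'Date': ['2016-03-01', '2016-03-01', '2016-03-02', '2016-03-02'],
...                     'Type': ['a', 'b', 'a', 'b'],
...                     'Value': [11.4, 8.9, 21.3, 20.8]})
>>> wide = df2.pivot(index='Date', columns='Type', values='Value') #Long to wide: one column per Type
>>> long = wide.reset_index().melt(id_vars='Date', value_name='Value') #Back to long form (the variable column picks up the name 'Type')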
Data Wrangling with pandas Cheat Sheet (http://pandas.pydata.org)
Pandas API Reference | Pandas User Guide

Tidy Data – A foundation for wrangling in pandas

In a tidy data set, each variable is saved in its own column and each observation is saved in its own row. Tidy data complements pandas's vectorized operations. pandas will automatically preserve observations as you manipulate variables. No other format works as intuitively with pandas.

Creating DataFrames

df = pd.DataFrame(
    {"a" : [4, 5, 6],
     "b" : [7, 8, 9],
     "c" : [10, 11, 12]},
    index = [1, 2, 3])
Specify values for each column.

df = pd.DataFrame(
    [[4, 7, 10],
     [5, 8, 11],
     [6, 9, 12]],
    index=[1, 2, 3],
    columns=['a', 'b', 'c'])
Specify values for each row.

df = pd.DataFrame(
    {"a" : [4, 5, 6],
     "b" : [7, 8, 9],
     "c" : [10, 11, 12]},
    index = pd.MultiIndex.from_tuples(
        [('d', 1), ('d', 2), ('e', 2)],
        names=['n', 'v']))
Create DataFrame with a MultiIndex.

Reshaping Data – Change layout, sorting, reindexing, renaming

df.sort_values('mpg')
Order rows by values of a column (low to high).
df.sort_values('mpg', ascending=False)
Order rows by values of a column (high to low).
pd.melt(df)
Gather columns into rows.
df.pivot(columns='var', values='val')
Spread rows into columns.
df.rename(columns = {'y':'year'})
Rename the columns of a DataFrame.
df.sort_index()
Sort the index of a DataFrame.
df.reset_index()
Reset index of DataFrame to row numbers, moving index to columns.
pd.concat([df1,df2])
Append rows of DataFrames.
pd.concat([df1,df2], axis=1)
Append columns of DataFrames.
df.drop(columns=['Length', 'Height'])
Drop columns from DataFrame.

Subset Observations - rows

df[df.Length > 7]
Extract rows that meet logical criteria.
df.drop_duplicates()
Remove duplicate rows (only considers columns).
df.sample(frac=0.5)
Randomly select fraction of rows.
df.sample(n=10)
Randomly select n rows.
df.nlargest(n, 'value')
Select and order top n entries.
df.nsmallest(n, 'value')
Select and order bottom n entries.
df.head(n)
Select first n rows.
df.tail(n)
Select last n rows.

Using query
query() allows Boolean expressions for filtering rows.
df.query('Length > 7')
df.query('Length > 7 and Width < 8')
df.query('Name.str.startswith("abc")', engine="python")

Subset Variables - columns

df[['width', 'length', 'species']]
Select multiple columns with specific names.
df['width'] or df.width
Select single column with specific name.
df.filter(regex='regex')
Select columns whose name matches regular expression regex.

Subsets - rows and columns

Use df.loc[] and df.iloc[] to select only rows, only columns or both. Use df.at[] and df.iat[] to access a single value by row and column. First index selects rows, second index columns.
df.iloc[10:20]
Select rows 10-20.
df.iloc[:, [1, 2, 5]]
Select columns in positions 1, 2 and 5 (first column is 0).
df.loc[:, 'x2':'x4']
Select all columns between x2 and x4 (inclusive).
df.loc[df['a'] > 10, ['a', 'c']]
Select rows meeting logical condition, and only the specific columns.
df.iat[1, 2]
Access single value by index.
df.at[4, 'A']
Access single value by label.

Method Chaining
Most pandas methods return a DataFrame so that another pandas method can be applied to the result. This improves readability of code.
df = (pd.melt(df)
        .rename(columns={
            'variable':'var',
            'value':'val'})
        .query('val >= 200')
     )

Logic in Python (and pandas)
<    Less than                   !=                         Not equal to
>    Greater than                df.column.isin(values)     Group membership
==   Equals                      pd.isnull(obj)             Is NaN
<=   Less than or equals         pd.notnull(obj)            Is not NaN
>=   Greater than or equals      &,|,~,^,df.any(),df.all()  Logical and, or, not, xor, any, all

regex (Regular Expressions) Examples
'\.'              Matches strings containing a period '.'
'Length$'         Matches strings ending with word 'Length'
'^Sepal'          Matches strings beginning with the word 'Sepal'
'^x[1-5]$'        Matches strings beginning with 'x' and ending with 1,2,3,4,5
'^(?!Species$).*' Matches strings except the string 'Species'
Summarize Data

df['w'].value_counts()
Count number of rows with each unique value of variable.
len(df)
# of rows in DataFrame.
df.shape
Tuple of # of rows, # of columns in DataFrame.
df['w'].nunique()
# of distinct values in a column.
df.describe()
Basic descriptive and statistics for each column (or GroupBy).

pandas provides a large set of summary functions that operate on different kinds of pandas objects (DataFrame columns, Series, GroupBy, Expanding and Rolling (see below)) and produce single values for each of the groups. When applied to a DataFrame, the result is returned as a pandas Series for each column. Examples:
sum()                  Sum values of each object.
count()                Count non-NA/null values of each object.
median()               Median value of each object.
quantile([0.25,0.75])  Quantiles of each object.
apply(function)        Apply function to each object.
min()                  Minimum value in each object.
max()                  Maximum value in each object.
mean()                 Mean value of each object.
var()                  Variance of each object.
std()                  Standard deviation of each object.

Handling Missing Data

df.dropna()
Drop rows with any column having NA/null data.
df.fillna(value)
Replace all NA/null data with value.

Make New Columns

df.assign(Area=lambda df: df.Length*df.Height)
Compute and append one or more new columns.
df['Volume'] = df.Length*df.Height*df.Depth
Add single column.
pd.qcut(df.col, n, labels=False)
Bin column into n buckets.

pandas provides a large set of vector functions that operate on all columns of a DataFrame or a single selected column (a pandas Series). These functions produce vectors of values for each of the columns, or a single Series for the individual Series. Examples:
max(axis=1)               Element-wise max.
min(axis=1)               Element-wise min.
clip(lower=-10,upper=10)  Trim values at input thresholds.
abs()                     Absolute value.

Group Data

df.groupby(by="col")
Return a GroupBy object, grouped by values in column named "col".
df.groupby(level="ind")
Return a GroupBy object, grouped by values in index level named "ind".
All of the summary functions listed above can be applied to a group. Additional GroupBy functions:
size()           Size of each group.
agg(function)    Aggregate group using function.
The examples below can also be applied to groups. In this case, the function is applied on a per-group basis, and the returned vectors are of the length of the original DataFrame.
shift(1)              Copy with values shifted by 1.
shift(-1)             Copy with values lagged by 1.
rank(method='dense')  Ranks with no gaps.
rank(method='min')    Ranks. Ties get min rank.
rank(pct=True)        Ranks rescaled to interval [0, 1].
rank(method='first')  Ranks. Ties go to first value.
cumsum()              Cumulative sum.
cummax()              Cumulative max.
cummin()              Cumulative min.
cumprod()             Cumulative product.

Windows

df.expanding()
Return an Expanding object allowing summary functions to be applied cumulatively.
df.rolling(n)
Return a Rolling object allowing summary functions to be applied to windows of length n.

Plotting

df.plot.hist()
Histogram for each column.
df.plot.scatter(x='w',y='h')
Scatter chart using pairs of points.

Combine Data Sets

adf           bdf
x1  x2        x1  x3
A   1         A   T
B   2         B   F
C   3         D   T

Standard Joins

pd.merge(adf, bdf, how='left', on='x1')
Join matching rows from bdf to adf.
x1  x2  x3
A   1   T
B   2   F
C   3   NaN

pd.merge(adf, bdf, how='right', on='x1')
Join matching rows from adf to bdf.
x1  x2   x3
A   1.0  T
B   2.0  F
D   NaN  T

pd.merge(adf, bdf, how='inner', on='x1')
Join data. Retain only rows in both sets.
x1  x2  x3
A   1   T
B   2   F

pd.merge(adf, bdf, how='outer', on='x1')
Join data. Retain all values, all rows.
x1  x2   x3
A   1    T
B   2    F
C   3    NaN
D   NaN  T

Filtering Joins

adf[adf.x1.isin(bdf.x1)]
All rows in adf that have a match in bdf.
x1  x2
A   1
B   2

adf[~adf.x1.isin(bdf.x1)]
All rows in adf that do not have a match in bdf.
x1  x2
C   3

ydf           zdf
x1  x2        x1  x2
A   1         B   2
B   2         C   3
C   3         D   4

Set-like Operations

pd.merge(ydf, zdf)
Rows that appear in both ydf and zdf (Intersection).
x1  x2
B   2
C   3

pd.merge(ydf, zdf, how='outer')
Rows that appear in either or both ydf and zdf (Union).
x1  x2
A   1
B   2
C   3
D   4

(pd.merge(ydf, zdf, how='outer', indicator=True)
   .query('_merge == "left_only"')
   .drop(columns=['_merge']))
Rows that appear in ydf but not zdf (Setdiff).
x1  x2
A   1

Cheatsheet for pandas (http://pandas.pydata.org/), originally written by Irv Lustig, Princeton Consultants, inspired by the RStudio Data Wrangling Cheatsheet.
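A runnable sketch of the joins above, building adf and bdf exactly as shown in the tables; nothing here goes beyond pd.merge and isin:

import pandas as pd
adf = pd.DataFrame({'x1': ['A', 'B', 'C'], 'x2': [1, 2, 3]})
bdf = pd.DataFrame({'x1': ['A', 'B', 'D'], 'x3': ['T', 'F', 'T']})
print(pd.merge(adf, bdf, how='left', on='x1'))  # A/B matched; C gets NaN in x3
print(adf[adf.x1.isin(bdf.x1)])                 # Filtering join: rows A and B
print(adf[~adf.x1.isin(bdf.x1)])                # Anti join: row C only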
Pandas Basics Cheat Sheet
Python For Data Science
Learn Pandas Basics online at www.DataCamp.com

The Pandas library is built on NumPy and provides easy-to-use data structures and data analysis tools for the Python programming language.

Use the following import convention:
>>> import pandas as pd

> Pandas Data Structures

Series
A one-dimensional labeled array capable of holding any data type.
>>> s = pd.Series([3, -5, 7, 4], index=['a', 'b', 'c', 'd'])

DataFrame
A two-dimensional labeled data structure with columns of potentially different types.
>>> data = {'Country': ['Belgium', 'India', 'Brazil'],
            'Capital': ['Brussels', 'New Delhi', 'Brasília'],
            'Population': [11190846, 1303171035, 207847528]}
>>> df = pd.DataFrame(data,
                      columns=['Country', 'Capital', 'Population'])

> I/O

Read and Write to CSV
>>> pd.read_csv('file.csv', header=None, nrows=5)
>>> df.to_csv('myDataFrame.csv')

Read and Write to Excel
>>> pd.read_excel('file.xlsx')
>>> df.to_excel('dir/myDataFrame.xlsx', sheet_name='Sheet1')

Read multiple sheets from the same file
>>> xlsx = pd.ExcelFile('file.xls')
>>> df = pd.read_excel(xlsx, 'Sheet1')

Read and Write to SQL Query or Database Table
>>> from sqlalchemy import create_engine
>>> engine = create_engine('sqlite:///:memory:')
>>> pd.read_sql("SELECT * FROM my_table;", engine)
>>> pd.read_sql_table('my_table', engine)
>>> pd.read_sql_query("SELECT * FROM my_table;", engine)
>>> df.to_sql('myDf', engine)
read_sql() is a convenience wrapper around read_sql_table() and read_sql_query().

> Asking For Help

>>> help(pd.Series.loc)

> Selection                                       Also see NumPy Arrays

Getting
>>> s['b'] #Get one element
-5
>>> df[1:] #Get subset of a DataFrame
   Country    Capital     Population
1  India      New Delhi   1303171035
2  Brazil     Brasília    207847528

Selecting, Boolean Indexing & Setting

By Position
>>> df.iloc[0,0] #Select single value by row & column
'Belgium'
>>> df.iat[0,0]
'Belgium'

By Label
>>> df.loc[0, 'Country'] #Select single value by row & column labels
'Belgium'
>>> df.at[0, 'Country']
'Belgium'

By Label/Position (df.ix was removed in pandas 1.0; use .loc or .iloc)
>>> df.iloc[2] #Select single row of subset of rows
Country        Brazil
Capital      Brasília
Population  207847528
>>> df.loc[:,'Capital'] #Select a single column of subset of columns
0     Brussels
1    New Delhi
2     Brasília
>>> df.loc[1,'Capital'] #Select rows and columns
'New Delhi'

Boolean Indexing
>>> s[~(s > 1)] #Series s where value is not >1
>>> s[(s < -1) | (s > 2)] #s where value is <-1 or >2
>>> df[df['Population']>1200000000] #Use filter to adjust DataFrame

Setting
>>> s['a'] = 6 #Set index a of Series s to 6

> Dropping

>>> s.drop(['a', 'c']) #Drop values from rows (axis=0)
>>> df.drop('Country', axis=1) #Drop values from columns (axis=1)

> Sort & Rank

>>> df.sort_index() #Sort by labels along an axis
>>> df.sort_values(by='Country') #Sort by the values along an axis
>>> df.rank() #Assign ranks to entries

> Retrieving Series/DataFrame Information

Basic Information
>>> df.shape #(rows, columns)
>>> df.index #Describe index
>>> df.columns #Describe DataFrame columns
>>> df.info() #Info on DataFrame
>>> df.count() #Number of non-NA values

Summary
>>> df.sum() #Sum of values
>>> df.cumsum() #Cumulative sum of values
>>> df.min()/df.max() #Minimum/maximum values
>>> df.idxmin()/df.idxmax() #Minimum/maximum index value
>>> df.describe() #Summary statistics
>>> df.mean() #Mean of values
>>> df.median() #Median of values

> Applying Functions

>>> f = lambda x: x*2
>>> df.apply(f) #Apply function
>>> df.applymap(f) #Apply function element-wise

> Data Alignment

Internal Data Alignment
NA values are introduced in the indices that don't overlap:
>>> s3 = pd.Series([7, -2, 3], index=['a', 'c', 'd'])
>>> s + s3
a    10.0
b     NaN
c     5.0
d     7.0

Arithmetic Operations with Fill Methods
You can also do the internal data alignment yourself with the help of the fill methods:
>>> s.add(s3, fill_value=0)
a    10.0
b    -5.0
c     5.0
d     7.0
>>> s.sub(s3, fill_value=2)
>>> s.div(s3, fill_value=4)
>>> s.mul(s3, fill_value=3)

Learn Data Skills Online at www.DataCamp.com
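A self-contained sketch of the selection idioms above, using the sheet's Country DataFrame; the expected values are shown as comments:

>>> import pandas as pd
>>> data = {'Country': ['Belgium', 'India', 'Brazil'],
...         'Capital': ['Brussels', 'New Delhi', 'Brasília'],
...         'Population': [11190846, 1303171035, 207847528]}
>>> df = pd.DataFrame(data)
>>> df.loc[1, 'Capital']               #'New Delhi' (label-based)
>>> df.iloc[2, 0]                      #'Brazil' (position-based)
>>> df[df['Population'] > 1200000000]  #Boolean filter: just the India row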
Importing Data Cheat Sheet
Python For Data Science
Learn Python online at www.DataCamp.com

> Importing Data in Python

Most of the time, you'll use either NumPy or pandas to import your data:
>>> import numpy as np
>>> import pandas as pd

> Help

>>> np.info(np.ndarray.dtype)
>>> help(pd.read_csv)

> Text Files

Plain Text Files
>>> filename = 'huck_finn.txt'
>>> file = open(filename, mode='r') #Open the file for reading
>>> text = file.read() #Read a file's contents
>>> print(file.closed) #Check whether file is closed
>>> file.close() #Close file
>>> print(text)

Using the context manager with
>>> with open('huck_finn.txt', 'r') as file:
...     print(file.readline()) #Read a single line
...     print(file.readline())
...     print(file.readline())

> Table Data: Flat Files

Importing Flat Files with NumPy

Files with one data type
>>> filename = 'mnist.txt'
>>> data = np.loadtxt(filename,
                      delimiter=',', #String used to separate values
                      skiprows=2, #Skip the first 2 lines
                      usecols=[0,2], #Read the 1st and 3rd column
                      dtype=str) #The type of the resulting array

Files with mixed data types
>>> filename = 'titanic.csv'
>>> data = np.genfromtxt(filename,
                         delimiter=',',
                         names=True, #Look for column header
                         dtype=None)
>>> data_array = np.recfromcsv(filename)
#The default dtype of the np.recfromcsv() function is None

Importing Flat Files with Pandas
>>> filename = 'winequality-red.csv'
>>> data = pd.read_csv(filename,
                       nrows=5, #Number of rows of file to read
                       header=None, #Row number to use as col names
                       sep='\t', #Delimiter to use
                       comment='#', #Character to split comments
                       na_values=[""]) #String to recognize as NA/NaN

> Excel Spreadsheets

>>> file = 'urbanpop.xlsx'
>>> data = pd.ExcelFile(file)
>>> df_sheet2 = data.parse('1960-1966',
                           skiprows=[0],
                           names=['Country',
                                  'AAM: War(2002)'])
>>> df_sheet1 = data.parse(0,
                           usecols=[0], #(parse_cols was renamed to usecols)
                           skiprows=[0],
                           names=['Country'])

To access the sheet names, use the sheet_names attribute:
>>> data.sheet_names

> SAS File

>>> from sas7bdat import SAS7BDAT
>>> with SAS7BDAT('urbanpop.sas7bdat') as file:
...     df_sas = file.to_data_frame()

> Stata File

>>> data = pd.read_stata('urbanpop.dta')

> Pickled Files

>>> import pickle
>>> with open('pickled_fruit.pkl', 'rb') as file:
...     pickled_data = pickle.load(file)

> Matlab Files

>>> import scipy.io
>>> filename = 'workspace.mat'
>>> mat = scipy.io.loadmat(filename)

> HDF5 Files

>>> import h5py
>>> filename = 'H-H1_LOSC_4_v1-815411200-4096.hdf5'
>>> data = h5py.File(filename, 'r')

> Exploring Dictionaries

>>> print(mat.keys()) #Print dictionary keys
>>> for key in data.keys(): #Print dictionary keys
...     print(key)
meta
quality
strain
>>> pickled_data.values() #Return dictionary values
>>> print(mat.items()) #Returns items in list format of (key, value) tuple pairs

Accessing Data Items with Keys
>>> for key in data['meta'].keys(): #Explore the HDF5 structure
...     print(key)
Description
DescriptionURL
Detector
Duration
GPSstart
Observatory
Type
UTCstart
#Retrieve the value for a key
>>> print(data['meta']['Description'][()]) #(.value was removed in h5py 3.0)

> Exploring Your Data

NumPy Arrays
>>> data_array.dtype #Data type of array elements
>>> data_array.shape #Array dimensions
>>> len(data_array) #Length of array

Pandas DataFrames
>>> df.head() #Return first DataFrame rows
>>> df.tail() #Return last DataFrame rows
>>> df.index #Describe index
>>> df.columns #Describe DataFrame columns
>>> df.info() #Info on DataFrame
>>> data_array = data.values #Convert a DataFrame to a NumPy array

> Relational Databases

>>> from sqlalchemy import create_engine
>>> engine = create_engine('sqlite:///Northwind.sqlite')

Use the table_names() method to fetch a list of table names:
>>> table_names = engine.table_names()

Querying Relational Databases
>>> con = engine.connect()
>>> rs = con.execute("SELECT * FROM Orders")
>>> df = pd.DataFrame(rs.fetchall())
>>> df.columns = rs.keys()
>>> con.close()

Using the context manager with
>>> with engine.connect() as con:
...     rs = con.execute("SELECT OrderID FROM Orders")
...     df = pd.DataFrame(rs.fetchmany(size=5))
...     df.columns = rs.keys()

Querying relational databases with pandas
>>> df = pd.read_sql_query("SELECT * FROM Orders", engine)

> Navigating Your FileSystem

Magic Commands
!ls #List directory contents of files and directories
%cd .. #Change current working directory
%pwd #Return the current working directory path

OS Library
>>> import os
>>> path = "/usr/tmp"
>>> wd = os.getcwd() #Store the name of current directory in a string
>>> os.listdir(wd) #Output contents of the directory in a list
>>> os.chdir(path) #Change current working directory
>>> os.rename("test1.txt", #Rename a file
              "test2.txt")
>>> os.remove("test1.txt") #Delete an existing file
>>> os.mkdir("newdir") #Create a new directory

Learn Data Skills Online at www.DataCamp.com
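A hedged, self-contained example of np.genfromtxt on mixed-type data; io.StringIO stands in for a file on disk so the snippet runs as-is:

>>> from io import StringIO #Stand-in for a real file on disk
>>> import numpy as np
>>> csv = StringIO("name,age,height\nAnn,28,1.68\nBob,35,1.82")
>>> data = np.genfromtxt(csv, delimiter=',', names=True, dtype=None, encoding='utf-8')
>>> data['age'] #Columns of the structured array are accessed by name
array([28, 35])
>>> data.dtype.names
('name', 'age', 'height')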
Python Basics
Getting started with Python Cheat Sheet
Learn Python online at www.DataCamp.com

> How to use this cheat sheet

Python is the most popular programming language in data science. It is easy to learn and comes with a wide array of powerful libraries for data analysis. This cheat sheet gives beginners and intermediate users a guide to getting started with Python. Use it to jump-start your journey with Python. If you want more detailed Python cheat sheets, check out the related sheets in this collection: Importing data in Python and Data wrangling in pandas.

> Accessing help and getting object types

1 + 1 # Everything after the hash symbol is ignored by Python
help(max) # Display the documentation for the max function
type('a') # Get the type of an object (this returns str)

> Importing packages

Python packages are a collection of useful tools developed by the open-source community. They extend the capabilities of the Python language. To install a new package (for example, pandas), you can go to your command prompt and type in pip install pandas. Once a package is installed, you can import it as follows.

import pandas # Import a package without an alias
import pandas as pd # Import a package with an alias
from pandas import DataFrame # Import an object from a package

> The working directory

The working directory is the default file path that Python reads or saves files into. An example of the working directory is "C://file/path". The os library is needed to set and get the working directory.

import os # Import the operating system package
os.getcwd() # Get the current working directory
os.chdir("new/working/directory") # Set the working directory to a new file path (there is no os.setcwd)

> Operators

Arithmetic operators
102 + 37 # Add two numbers with +
102 - 37 # Subtract a number with -
4 * 6 # Multiply two numbers with *
22 / 7 # Divide a number by another with /
22 // 7 # Integer divide a number with //
3 ** 4 # Raise to the power with ** (note: ^ is bitwise XOR, not exponentiation)
22 % 7 # Returns 1 # Get the remainder after division with %

Assignment operators
a = 5 # Assign a value to a
x[0] = 1 # Change the value of an item in a list

Numeric comparison operators
3 == 3 # Test for equality with ==
3 != 3 # Test for inequality with !=
3 > 1 # Test greater than with >
3 >= 3 # Test greater than or equal to with >=
3 < 4 # Test less than with <
3 <= 4 # Test less than or equal to with <=

Logical operators
~(2 == 2) # NOT with ~ (bitwise; for plain booleans prefer not)
(1 >= 1) | (1 < 1) # Logical OR with |
(1 != 1) & (1 < 1) # Logical AND with &
(1 != 1) ^ (1 < 1) # Logical XOR with ^

> Getting started with lists

A list is an ordered and changeable sequence of elements. It can hold integers, characters, floats, strings, and even objects.

Creating lists
x = [1, 3, 2] # Create lists with [], elements separated by commas

List functions and methods
sorted(x) # Return a sorted copy of the list, e.g., [1, 2, 3]
x.sort() # Sort the list in place (modifies x)
list(reversed(x)) # Return a reversed copy of the list, e.g., [2, 3, 1]
x.reverse() # Reverse the list in place
x.count(2) # Count the number of times the element 2 appears in the list

Selecting list elements
Python lists are zero-indexed (the first element has index 0). For ranges, the first element is included but the last is not.
x = ['a', 'b', 'c', 'd', 'e'] # Define the list
x[0] # Select the 0th element in the list
x[-1] # Select the last element in the list
x[1:3] # Select 1st (inclusive) to 3rd (exclusive)
x[2:] # Select the 2nd to the end
x[:3] # Select 0th to 3rd (exclusive)

Concatenating lists
x = [1, 3, 6] # Define the x and y lists
y = [10, 15, 21]
x + y # Returns [1, 3, 6, 10, 15, 21]
3 * x # Returns [1, 3, 6, 1, 3, 6, 1, 3, 6]

> Getting started with characters and strings

"DataCamp" # Create a string with double or single quotes
"He said, \"DataCamp\"" # Embed a quote in string with the escape character \
# Create multi-line strings with triple quotes
"""
A Frame of Data
Tidy, Mine, Analyze It
Now You Have Meaning
Citation: https://mdsr-book.github.io/haikus.html
"""
str[0] # Get the character at a specific position
str[0:2] # Get a substring from starting to ending index (exclusive)

Combining and splitting strings
"Data" + "Framed" # Concatenate strings with +, this returns 'DataFramed'
3 * "data " # Repeat strings with *, this returns 'data data data '
"beekeepers".split("e") # Split a string on a delimiter, returns ['b', '', 'k', '', 'p', 'rs']

Mutate strings (string methods return a new string; strings themselves are immutable)
str = "Jack and Jill" # Define str
str.upper() # Convert a string to uppercase, returns 'JACK AND JILL'
str.lower() # Convert a string to lowercase, returns 'jack and jill'
str.title() # Convert a string to title case, returns 'Jack And Jill'
str.replace("J", "P") # Replace matches of a substring with another, returns 'Pack and Pill'

> Getting started with dictionaries

A dictionary stores data values in key-value pairs. That is, unlike lists, which are indexed by position, dictionaries are indexed by their keys, the names of which must be unique.

Creating dictionaries
{'a': 1, 'b': 4, 'c': 9} # Create a dictionary with {}

Dictionary functions and methods
x = {'a': 1, 'b': 2, 'c': 3} # Define the x dictionary
x.keys() # Get the keys of a dictionary, returns dict_keys(['a', 'b', 'c'])
x.values() # Get the values of a dictionary, returns dict_values([1, 2, 3])

Selecting dictionary elements
x['a'] # Get a value from a dictionary by specifying the key, returns 1

> NumPy arrays

NumPy is a Python package for scientific computing. It provides multidimensional array objects and efficient operations on them. To import NumPy, run import numpy as np.

Creating arrays
np.array([1, 2, 3]) # Convert a Python list to a NumPy array, returns array([1, 2, 3])
np.arange(1,5) # Return a sequence from start (inclusive) to end (exclusive), returns array([1, 2, 3, 4])
np.arange(1,5,2) # Return a stepped sequence from start (inclusive) to end (exclusive), returns array([1, 3])
np.repeat([1, 3, 6], 3) # Repeat each value 3 times, returns array([1, 1, 1, 3, 3, 3, 6, 6, 6])
np.tile([1, 3, 6], 3) # Repeat the whole array 3 times, returns array([1, 3, 6, 1, 3, 6, 1, 3, 6])

Math functions and methods
All functions take an array as the input.
np.log(x) # Calculate logarithm
np.exp(x) # Calculate exponential
np.max(x) # Get maximum value
np.min(x) # Get minimum value
np.sum(x) # Calculate sum
np.mean(x) # Calculate mean
np.quantile(x, q) # Calculate q-th quantile
np.round(x, n) # Round to n decimal places
np.var(x) # Calculate variance
np.std(x) # Calculate standard deviation

> Getting started with DataFrames

Pandas is a fast and powerful package for data analysis and manipulation in Python. To import the package, you can use import pandas as pd. A pandas DataFrame is a structure that contains two-dimensional data stored as rows and columns. A pandas Series is a structure that contains one-dimensional data.

Creating DataFrames
# Create a dataframe from a dictionary
pd.DataFrame({
    'a': [1, 2, 3],
    'b': np.array([4, 4, 6]),
    'c': ['x', 'x', 'y']
})
# Create a dataframe from a list of dictionaries
pd.DataFrame([
    {'a': 1, 'b': 4, 'c': 'x'},
    {'a': 1, 'b': 4, 'c': 'x'},
    {'a': 3, 'b': 6, 'c': 'y'}
])

Selecting DataFrame Elements
Select a row, column or element from a dataframe. Remember: all positions are counted from zero, not one.
df.iloc[3] # Select the row at position 3 (the 4th row)
df['col'] # Select one column by name
df[['col1', 'col2']] # Select multiple columns by names
df.iloc[:, 2] # Select the column at position 2 (the 3rd column)
df.iloc[3, 2] # Select the element at row position 3, column position 2

Manipulating DataFrames
pd.concat([df, df]) # Concatenate DataFrames vertically
pd.concat([df, df], axis="columns") # Concatenate DataFrames horizontally
df.query('logical_condition') # Get rows matching a condition
df.drop(columns=['col_name']) # Drop columns by name
df.rename(columns={"oldname": "newname"}) # Rename columns
df.assign(temp_f=9 / 5 * df['temp_c'] + 32) # Add a new column
df.mean() # Calculate the mean of each column
df.agg(aggregation_function) # Get summary statistics by column
df.drop_duplicates() # Get unique rows
df.sort_values(by='col_name') # Sort by values in a column
df.nlargest(n, 'col_name') # Get rows with largest values in a column
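The DataFrame operations above chain naturally; here is a short illustrative sketch (the frame and the city/temp_c column names are made up, not part of the sheet):

# A short sketch combining several of the DataFrame operations above
import pandas as pd
df = pd.DataFrame({'city': ['Oslo', 'Cairo', 'Lima'],
                   'temp_c': [2, 30, 18]})
result = (df.assign(temp_f=9 / 5 * df['temp_c'] + 32)  # Add a new column
            .query('temp_f > 40')                      # Keep only the warm cities
            .sort_values(by='temp_f'))                 # Sort by the new column
print(result)  # Lima (64.4) then Cairo (86.0)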
Python For Data Science Cheat Sheet: Python Basics
Learn More Python for Data Science Interactively at www.datacamp.com

Variables and Data Types

Variable Assignment
>>> x=5
>>> x
5

Calculations With Variables
>>> x+2 #Sum of two variables
7
>>> x-2 #Subtraction of two variables
3
>>> x*2 #Multiplication of two variables
10
>>> x**2 #Exponentiation of a variable
25
>>> x%2 #Remainder of a variable
1
>>> x/float(2) #Division of a variable
2.5

Types and Type Conversion
str() '5', '3.45', 'True' #Variables to strings
int() 5, 3, 1 #Variables to integers
float() 5.0, 1.0 #Variables to floats
bool() True, True, True #Variables to booleans

Asking For Help
>>> help(str)

Strings
>>> my_string = 'thisStringIsAwesome'
>>> my_string
'thisStringIsAwesome'

String Operations (index starts at 0)
>>> my_string * 2
'thisStringIsAwesomethisStringIsAwesome'
>>> my_string + 'Innit'
'thisStringIsAwesomeInnit'
>>> 'm' in my_string
True
>>> my_string[3]
>>> my_string[4:9]

String Methods
>>> my_string.upper() #String to uppercase
>>> my_string.lower() #String to lowercase
>>> my_string.count('w') #Count String elements
>>> my_string.replace('e', 'i') #Replace String elements
>>> my_string.strip() #Strip whitespace

Lists (Also see NumPy Arrays)
>>> a = 'is'
>>> b = 'nice'
>>> my_list = ['my', 'list', a, b]
>>> my_list2 = [[4,5,6,7], [3,4,5,6]]

Selecting List Elements (index starts at 0)
Subset
>>> my_list[1] #Select item at index 1
>>> my_list[-3] #Select 3rd last item
Slice
>>> my_list[1:3] #Select items at index 1 and 2
>>> my_list[1:] #Select items after index 0
>>> my_list[:3] #Select items before index 3
>>> my_list[:] #Copy my_list
Subset Lists of Lists
>>> my_list2[1][0] #my_list[list][itemOfList]
>>> my_list2[1][:2]

List Operations
>>> my_list + my_list
['my', 'list', 'is', 'nice', 'my', 'list', 'is', 'nice']
>>> my_list * 2
['my', 'list', 'is', 'nice', 'my', 'list', 'is', 'nice']
>>> my_list2 > [[1]] #Lists compare element by element; returns True (comparing a list to an int, as in my_list2 > 4, raises TypeError in Python 3)
True

List Methods
>>> my_list.index(a) #Get the index of an item
>>> my_list.count(a) #Count an item
>>> my_list.append('!') #Append an item at a time
>>> my_list.remove('!') #Remove an item
>>> del(my_list[0:1]) #Remove an item
>>> my_list.reverse() #Reverse the list
>>> my_list.extend('!') #Append an item
>>> my_list.pop(-1) #Remove an item
>>> my_list.insert(0,'!') #Insert an item
>>> my_list.sort() #Sort the list

Libraries
Import libraries
>>> import numpy
>>> import numpy as np
Selective import
>>> from math import pi
Common library categories: data analysis, machine learning, scientific computing, 2D plotting.

Install Python: Anaconda (leading open data science platform powered by Python), Spyder (free IDE included with Anaconda), Jupyter Notebook (create and share documents with live code, visualizations, text, ...).

Numpy Arrays (Also see Lists)
>>> my_list = [1, 2, 3, 4]
>>> my_array = np.array(my_list)
>>> my_2darray = np.array([[1,2,3],[4,5,6]])

Selecting Numpy Array Elements (index starts at 0)
Subset
>>> my_array[1] #Select item at index 1
2
Slice
>>> my_array[0:2] #Select items at index 0 and 1
array([1, 2])
Subset 2D Numpy arrays
>>> my_2darray[:,0] #my_2darray[rows, columns]
array([1, 4])

Numpy Array Operations
>>> my_array > 3
array([False, False, False, True], dtype=bool)
>>> my_array * 2
array([2, 4, 6, 8])
>>> my_array + np.array([5, 6, 7, 8])
array([ 6, 8, 10, 12])

Numpy Array Functions
>>> my_array.shape #Get the dimensions of the array
>>> np.append(my_array, other_array) #Append items to an array
>>> np.insert(my_array, 1, 5) #Insert items in an array
>>> np.delete(my_array,[1]) #Delete items in an array
>>> np.mean(my_array) #Mean of the array
>>> np.median(my_array) #Median of the array
>>> np.corrcoef(my_array) #Correlation coefficient
>>> np.std(my_array) #Standard deviation
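Note that most list methods mutate the list in place and return None; a short interactive sketch (reusing the my_list defined above) makes the difference from sorted() visible:

>>> my_list = ['my', 'list', 'is', 'nice']
>>> my_list.append('!') #my_list is now ['my', 'list', 'is', 'nice', '!']
>>> my_list.pop(-1) #Removes and returns the last item
'!'
>>> my_list.index('is') #Position of 'is'
2
>>> sorted(my_list) #Sorted copy; my_list itself is unchanged
['is', 'list', 'my', 'nice']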
Python For Data Science Cheat Sheet: Jupyter Notebook
Learn More Python for Data Science Interactively at www.DataCamp.com

Working with Different Programming Languages
Kernels provide computation and communication with front-end interfaces like the notebooks. There are three main kernels: IPython, IRkernel, and IJulia. Installing Jupyter Notebook will automatically install the IPython kernel.

Widgets
Notebook widgets provide the ability to visualize and control changes in your data, often as a control like a slider, textbox, etc. You can use them to build interactive GUIs for your notebooks or to synchronize stateful and stateless information between Python and JavaScript. Widget commands: download serialized state of all widget models in use, save notebook with interactive widgets, embed current widgets.

Saving/Loading Notebooks
- Create new notebook; open an existing notebook; make a copy of the current notebook; rename notebook
- Save current notebook and record checkpoint; revert notebook to a previous checkpoint
- Preview of the printed notebook
- Download notebook as IPython notebook, Python, HTML, Markdown, reST, LaTeX, or PDF
- Close notebook & stop running any scripts

Kernel commands
- Restart kernel; restart kernel & run all cells
- Interrupt kernel; interrupt kernel & clear all output
- Connect back to a remote notebook
- Run other installed kernels

Command Mode toolbar (buttons, left to right):
1. Save and checkpoint
2. Insert cell below
3. Cut cell
4. Copy cell(s)
5. Paste cell(s) below
6. Move cell up
7. Move cell down
8. Run current cell
9. Interrupt kernel
10. Restart kernel
11. Display characteristics
12. Open command palette
13. Current kernel
14. Kernel status
15. Log out from notebook server

Writing Code And Text
Code and text are encapsulated by 3 basic cell types: markdown cells, code cells, and raw NBConvert cells.

Edit Cells (Edit Mode)
- Cut currently selected cells to clipboard; copy cells from clipboard to current cursor position
- Paste cells from clipboard above current cell, below current cell, or on top of current cell
- Delete current cells; revert "Delete Cells" invocation
- Split up a cell from current cursor position
- Merge current cell with the one above or below
- Move current cell up or down
- Adjust metadata underlying the current notebook
- Find and replace in selected cells
- Remove cell attachments; copy attachments of current cell; paste attachments of current cell
- Insert image in selected cells

Executing Cells
- Run selected cell(s); run current cells down and create a new one below or above
- Run all cells; run all cells above the current cell; run all cells below the current cell
- Change the cell type of current cell
- Toggle display, toggle scrolling, and clear current outputs
- Toggle display, toggle scrolling, and clear all output

View Cells
- Toggle display of Jupyter logo and filename
- Toggle display of toolbar
- Toggle display of cell action icons: None, Edit metadata, Raw cell format, Slideshow, Attachments, Tags
- Toggle line numbers in cells

Insert Cells
- Add new cell above the current one
- Add new cell below the current one

Asking For Help
- Walk through a UI tour
- List of built-in keyboard shortcuts; edit the built-in keyboard shortcuts
- Notebook help topics; description of markdown available in notebook
- Information on unofficial Jupyter Notebook extensions
- Python, IPython, NumPy, SciPy, Matplotlib, SymPy, and Pandas help topics
- About Jupyter Notebook
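The widgets described above are easiest to see in action. Below is a minimal sketch (assuming the ipywidgets package is installed, which is not part of this sheet) that builds a slider-driven control in a code cell:

# Run inside a notebook code cell; requires `pip install ipywidgets`
from ipywidgets import interact

def f(x):
    return 3 * x

# Renders a slider from 0 to 10; moving it re-runs f and displays the result
interact(f, x=(0, 10))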
Python For Data Science Cheat Sheet
SciPy - Linear Algebra
Learn More Python for Data Science Interactively at www.datacamp.com

SciPy
The SciPy library is one of the core packages for scientific computing that provides mathematical
algorithms and convenience functions built on the NumPy extension of Python.

Interacting With NumPy                               Also see NumPy
>>> import numpy as np
>>> a = np.array([1,2,3])
>>> b = np.array([(1+5j,2j,3j), (4j,5j,6j)])
>>> c = np.array([[(1.5,2,3), (4,5,6)], [(3,2,1), (4,5,6)]])

Index Tricks
>>> np.mgrid[0:5,0:5]               Create a dense meshgrid
>>> np.ogrid[0:2,0:2]               Create an open meshgrid
>>> np.r_[3,[0]*5,-1:1:10j]         Stack arrays vertically (row-wise)
>>> np.c_[b,c]                      Create stacked column-wise arrays

Shape Manipulation
>>> np.transpose(b)                 Permute array dimensions
>>> b.flatten()                     Flatten the array
>>> np.hstack((b,c))                Stack arrays horizontally (column-wise)
>>> np.vstack((a,b))                Stack arrays vertically (row-wise)
>>> np.hsplit(c,2)                  Split the array horizontally at the 2nd index
>>> np.vsplit(d,2)                  Split the array vertically at the 2nd index

Polynomials
>>> from numpy import poly1d
>>> p = poly1d([3,4,5])             Create a polynomial object

Vectorizing Functions
>>> def myfunc(a):
...     if a < 0:
...         return a*2
...     else:
...         return a/2
>>> np.vectorize(myfunc)            Vectorize functions

Type Handling
>>> np.real(c)                      Return the real part of the array elements
>>> np.imag(c)                      Return the imaginary part of the array elements
>>> np.real_if_close(c,tol=1000)    Return a real array if complex parts close to 0
>>> np.cast['f'](np.pi)             Cast object to a data type

Other Useful Functions
>>> np.angle(b,deg=True)            Return the angle of the complex argument
>>> g = np.linspace(0,np.pi,num=5)  Create an array of evenly spaced values (number of samples)
>>> g[3:] += np.pi
>>> np.unwrap(g)                    Unwrap
>>> np.logspace(0,10,3)             Create an array of evenly spaced values (log scale)
>>> np.select([c<4],[c*2])          Return values from a list of arrays depending on conditions
>>> misc.factorial(a)               Factorial
>>> misc.comb(10,3,exact=True)      Combinations of N things taken k at a time
>>> misc.central_diff_weights(3)    Weights for Np-point central derivative
>>> misc.derivative(myfunc,1.0)     Find the n-th derivative of a function at a point

Linear Algebra                                       Also see NumPy
You'll use the linalg and sparse modules. Note that scipy.linalg contains and expands on numpy.linalg.
>>> from scipy import linalg, sparse

Creating Matrices
>>> A = np.matrix(np.random.random((2,2)))
>>> B = np.asmatrix(b)
>>> C = np.mat(np.random.random((10,5)))
>>> D = np.mat([[3,4], [5,6]])

Basic Matrix Routines
Inverse
>>> A.I                             Inverse
>>> linalg.inv(A)                   Inverse
>>> A.T                             Transpose matrix
>>> A.H                             Conjugate transposition
>>> np.trace(A)                     Trace
Norm
>>> linalg.norm(A)                  Frobenius norm
>>> linalg.norm(A,1)                L1 norm (max column sum)
>>> linalg.norm(A,np.inf)           L inf norm (max row sum)
Rank
>>> np.linalg.matrix_rank(C)        Matrix rank
Determinant
>>> linalg.det(A)                   Determinant
Solving linear problems
>>> linalg.solve(A,b)               Solver for dense matrices
>>> E = np.mat(a).T                 Create a column matrix from a
>>> linalg.lstsq(D,E)               Least-squares solution to linear matrix equation
Generalized inverse
>>> linalg.pinv(C)                  Compute the pseudo-inverse of a matrix (least-squares solver)
>>> linalg.pinv2(C)                 Compute the pseudo-inverse of a matrix (SVD)

Creating Sparse Matrices
>>> F = np.eye(3, k=1)              3x3 array with ones on the first superdiagonal
>>> G = np.mat(np.identity(2))      Create a 2x2 identity matrix
>>> C[C > 0.5] = 0
>>> H = sparse.csr_matrix(C)        Compressed Sparse Row matrix
>>> I = sparse.csc_matrix(D)        Compressed Sparse Column matrix
>>> J = sparse.dok_matrix(A)        Dictionary Of Keys matrix
>>> E.todense()                     Sparse matrix to full matrix
>>> sparse.isspmatrix_csc(A)        Identify sparse matrix

Sparse Matrix Routines
Inverse
>>> sparse.linalg.inv(I)            Inverse
Norm
>>> sparse.linalg.norm(I)           Norm
Solving linear problems
>>> sparse.linalg.spsolve(H,I)      Solver for sparse matrices

Sparse Matrix Functions
>>> sparse.linalg.expm(I)           Sparse matrix exponential

Matrix Functions
Addition
>>> np.add(A,D)                     Addition
Subtraction
>>> np.subtract(A,D)                Subtraction
Division
>>> np.divide(A,D)                  Division
Multiplication
>>> np.multiply(D,A)                Multiplication
>>> np.dot(A,D)                     Dot product
>>> np.vdot(A,D)                    Vector dot product
>>> np.inner(A,D)                   Inner product
>>> np.outer(A,D)                   Outer product
>>> np.tensordot(A,D)               Tensor dot product
>>> np.kron(A,D)                    Kronecker product
Exponential Functions
>>> linalg.expm(A)                  Matrix exponential
>>> linalg.expm2(A)                 Matrix exponential (Taylor Series)
>>> linalg.expm3(D)                 Matrix exponential (eigenvalue decomposition)
Logarithm Function
>>> linalg.logm(A)                  Matrix logarithm
Trigonometric Functions
>>> linalg.sinm(D)                  Matrix sine
>>> linalg.cosm(D)                  Matrix cosine
>>> linalg.tanm(A)                  Matrix tangent
Hyperbolic Trigonometric Functions
>>> linalg.sinhm(D)                 Hyperbolic matrix sine
>>> linalg.coshm(D)                 Hyperbolic matrix cosine
>>> linalg.tanhm(A)                 Hyperbolic matrix tangent
Matrix Sign Function
>>> linalg.signm(A)                 Matrix sign function
Matrix Square Root
>>> linalg.sqrtm(A)                 Matrix square root
Arbitrary Functions
>>> linalg.funm(A, lambda x: x*x)   Evaluate matrix function

Decompositions
Eigenvalues and Eigenvectors
>>> la, v = linalg.eig(A)           Solve ordinary or generalized eigenvalue problem for square matrix
>>> l1, l2 = la                     Unpack eigenvalues
>>> v[:,0]                          First eigenvector
>>> v[:,1]                          Second eigenvector
>>> linalg.eigvals(A)               Compute eigenvalues
Singular Value Decomposition
>>> U,s,Vh = linalg.svd(B)          Singular Value Decomposition (SVD)
>>> M,N = B.shape
>>> Sig = linalg.diagsvd(s,M,N)     Construct sigma matrix in SVD
LU Decomposition
>>> P,L,U = linalg.lu(C)            LU Decomposition

Sparse Matrix Decompositions
>>> la, v = sparse.linalg.eigs(F,1)  Eigenvalues and eigenvectors
>>> sparse.linalg.svds(H, 2)        SVD

Asking For Help
>>> help(scipy.linalg.diagsvd)
>>> np.info(np.matrix)

DataCamp
Learn Python for Data Science Interactively
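As a quick illustration of the dense solver listed above, here is a minimal, self-contained sketch (the 2x2 system is made up for the example) that solves A @ x = rhs and verifies the result:

import numpy as np
from scipy import linalg

# A small, well-conditioned system A @ x = rhs
A = np.array([[3.0, 4.0],
              [5.0, 6.0]])
rhs = np.array([7.0, 8.0])

x = linalg.solve(A, rhs)        # solver for dense matrices
print(np.allclose(A @ x, rhs))  # True -- the solution satisfies the system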
Python For Data Science Cheat Sheet
Pandas Basics
Learn Python for Data Science Interactively at www.DataCamp.com

Pandas
The Pandas library is built on NumPy and provides easy-to-use data structures and data analysis
tools for the Python programming language.

Use the following import convention:
>>> import pandas as pd

Pandas Data Structures
Series
A one-dimensional labeled array capable of holding any data type
    a    3
    b   -5
    c    7
    d    4
>>> s = pd.Series([3, -5, 7, 4], index=['a', 'b', 'c', 'd'])

DataFrame
A two-dimensional labeled data structure with columns of potentially different types
       Country     Capital  Population
    0  Belgium    Brussels    11190846
    1    India   New Delhi  1303171035
    2   Brazil    Brasília   207847528
>>> data = {'Country': ['Belgium', 'India', 'Brazil'],
            'Capital': ['Brussels', 'New Delhi', 'Brasília'],
            'Population': [11190846, 1303171035, 207847528]}
>>> df = pd.DataFrame(data,
                      columns=['Country', 'Capital', 'Population'])

I/O
Read and Write to CSV
>>> pd.read_csv('file.csv', header=None, nrows=5)
>>> df.to_csv('myDataFrame.csv')
Read and Write to Excel
>>> pd.read_excel('file.xlsx')
>>> df.to_excel('dir/myDataFrame.xlsx', sheet_name='Sheet1')
Read multiple sheets from the same file
>>> xlsx = pd.ExcelFile('file.xls')
>>> df = pd.read_excel(xlsx, 'Sheet1')
Read and Write to SQL Query or Database Table
>>> from sqlalchemy import create_engine
>>> engine = create_engine('sqlite:///:memory:')
>>> pd.read_sql("SELECT * FROM my_table;", engine)
>>> pd.read_sql_table('my_table', engine)
>>> pd.read_sql_query("SELECT * FROM my_table;", engine)
read_sql() is a convenience wrapper around read_sql_table() and read_sql_query()
>>> df.to_sql('myDf', engine)

Asking For Help
>>> help(pd.Series.loc)

Selection                                       Also see NumPy Arrays
Getting
>>> s['b']                   Get one element
-5
>>> df[1:]                   Get subset of a DataFrame
   Country     Capital  Population
1    India   New Delhi  1303171035
2   Brazil    Brasília   207847528

Selecting, Boolean Indexing & Setting
By Position
>>> df.iloc[0, 0]            Select single value by row & column
'Belgium'
>>> df.iat[0, 0]
'Belgium'
By Label
>>> df.loc[0, 'Country']     Select single value by row & column labels
'Belgium'
>>> df.at[0, 'Country']
'Belgium'
By Label/Position
(.ix has been removed from pandas; use .loc or .iloc instead)
>>> df.iloc[2]               Select single row or subset of rows
Country        Brazil
Capital      Brasília
Population  207847528
>>> df.loc[:, 'Capital']     Select a single column or subset of columns
0     Brussels
1    New Delhi
2     Brasília
>>> df.loc[1, 'Capital']     Select rows and columns
'New Delhi'
Boolean Indexing
>>> s[~(s > 1)]              Series s where value is not >1
>>> s[(s < -1) | (s > 2)]    s where value is <-1 or >2
>>> df[df['Population']>1200000000]  Use filter to adjust DataFrame
Setting
>>> s['a'] = 6               Set index a of Series s to 6

Dropping
>>> s.drop(['a', 'c'])           Drop values from rows (axis=0)
>>> df.drop('Country', axis=1)   Drop values from columns (axis=1)

Sort & Rank
>>> df.sort_index()              Sort by labels along an axis
>>> df.sort_values(by='Country') Sort by the values along an axis
>>> df.rank()                    Assign ranks to entries

Retrieving Series/DataFrame Information
Basic Information
>>> df.shape                 (rows, columns)
>>> df.index                 Describe index
>>> df.columns               Describe DataFrame columns
>>> df.info()                Info on DataFrame
>>> df.count()               Number of non-NA values
Summary
>>> df.sum()                 Sum of values
>>> df.cumsum()              Cumulative sum of values
>>> df.min()/df.max()        Minimum/maximum values
>>> df.idxmin()/df.idxmax()  Minimum/maximum index value
>>> df.describe()            Summary statistics
>>> df.mean()                Mean of values
>>> df.median()              Median of values

Applying Functions
>>> f = lambda x: x*2
>>> df.apply(f)              Apply function
>>> df.applymap(f)           Apply function element-wise

Data Alignment
Internal Data Alignment
NA values are introduced in the indices that don't overlap:
>>> s3 = pd.Series([7, -2, 3], index=['a', 'c', 'd'])
>>> s + s3
a    10.0
b     NaN
c     5.0
d     7.0

Arithmetic Operations with Fill Methods
You can also do the internal data alignment yourself with the help of the fill methods:
>>> s.add(s3, fill_value=0)
a    10.0
b    -5.0
c     5.0
d     7.0
>>> s.sub(s3, fill_value=2)
>>> s.div(s3, fill_value=4)
>>> s.mul(s3, fill_value=3)

DataCamp
Learn Python for Data Science Interactively
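To make the SQL entries above concrete, here is a small round-trip sketch using the in-memory SQLite engine from the sheet; the table name myDf and the toy DataFrame are just for illustration:

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('sqlite:///:memory:')

df = pd.DataFrame({'Country': ['Belgium', 'India', 'Brazil'],
                   'Population': [11190846, 1303171035, 207847528]})
df.to_sql('myDf', engine, index=False)               # write the table

# read_sql() dispatches to read_sql_table() or read_sql_query()
by_name = pd.read_sql('myDf', engine)                # by table name
by_query = pd.read_sql('SELECT * FROM myDf;', engine)  # by SQL query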
Python For Data Science Cheat Sheet
Scikit-Learn
Learn Python for data science Interactively at www.DataCamp.com

Scikit-learn
Scikit-learn is an open source Python library that implements a range of machine learning,
preprocessing, cross-validation and visualization algorithms using a unified interface.

A Basic Example
>>> from sklearn import neighbors, datasets, preprocessing
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.metrics import accuracy_score
>>> iris = datasets.load_iris()
>>> X, y = iris.data[:, :2], iris.target
>>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)
>>> scaler = preprocessing.StandardScaler().fit(X_train)
>>> X_train = scaler.transform(X_train)
>>> X_test = scaler.transform(X_test)
>>> knn = neighbors.KNeighborsClassifier(n_neighbors=5)
>>> knn.fit(X_train, y_train)
>>> y_pred = knn.predict(X_test)
>>> accuracy_score(y_test, y_pred)

Loading The Data                                 Also see NumPy & Pandas
Your data needs to be numeric and stored as NumPy arrays or SciPy sparse matrices.
Other types that are convertible to numeric arrays, such as Pandas DataFrame, are also acceptable.
>>> import numpy as np
>>> X = np.random.random((10,5))
>>> y = np.array(['M','M','F','F','M','F','M','M','F','F'])
>>> X[X < 0.7] = 0

Training And Test Data
>>> from sklearn.model_selection import train_test_split
>>> X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        random_state=0)

Preprocessing The Data
Standardization
>>> from sklearn.preprocessing import StandardScaler
>>> scaler = StandardScaler().fit(X_train)
>>> standardized_X = scaler.transform(X_train)
>>> standardized_X_test = scaler.transform(X_test)
Normalization
>>> from sklearn.preprocessing import Normalizer
>>> scaler = Normalizer().fit(X_train)
>>> normalized_X = scaler.transform(X_train)
>>> normalized_X_test = scaler.transform(X_test)
Binarization
>>> from sklearn.preprocessing import Binarizer
>>> binarizer = Binarizer(threshold=0.0).fit(X)
>>> binary_X = binarizer.transform(X)
Encoding Categorical Features
>>> from sklearn.preprocessing import LabelEncoder
>>> enc = LabelEncoder()
>>> y = enc.fit_transform(y)
Imputing Missing Values
>>> from sklearn.impute import SimpleImputer
>>> imp = SimpleImputer(missing_values=0, strategy='mean')
>>> imp.fit_transform(X_train)
Generating Polynomial Features
>>> from sklearn.preprocessing import PolynomialFeatures
>>> poly = PolynomialFeatures(5)
>>> poly.fit_transform(X)

Create Your Model
Supervised Learning Estimators
Linear Regression
>>> from sklearn.linear_model import LinearRegression
>>> lr = LinearRegression()
Support Vector Machines (SVM)
>>> from sklearn.svm import SVC
>>> svc = SVC(kernel='linear')
Naive Bayes
>>> from sklearn.naive_bayes import GaussianNB
>>> gnb = GaussianNB()
KNN
>>> from sklearn import neighbors
>>> knn = neighbors.KNeighborsClassifier(n_neighbors=5)
Unsupervised Learning Estimators
Principal Component Analysis (PCA)
>>> from sklearn.decomposition import PCA
>>> pca = PCA(n_components=0.95)
K Means
>>> from sklearn.cluster import KMeans
>>> k_means = KMeans(n_clusters=3, random_state=0)

Model Fitting
Supervised learning
>>> lr.fit(X, y)                             Fit the model to the data
>>> knn.fit(X_train, y_train)
>>> svc.fit(X_train, y_train)
Unsupervised Learning
>>> k_means.fit(X_train)                     Fit the model to the data
>>> pca_model = pca.fit_transform(X_train)   Fit to data, then transform it

Prediction
Supervised Estimators
>>> y_pred = svc.predict(np.random.random((2,5)))  Predict labels
>>> y_pred = lr.predict(X_test)                    Predict labels
>>> y_pred = knn.predict_proba(X_test)             Estimate probability of a label
Unsupervised Estimators
>>> y_pred = k_means.predict(X_test)               Predict labels in clustering algos

Evaluate Your Model's Performance
Classification Metrics
Accuracy Score
>>> knn.score(X_test, y_test)                Estimator score method
>>> from sklearn.metrics import accuracy_score   Metric scoring functions
>>> accuracy_score(y_test, y_pred)
Classification Report
>>> from sklearn.metrics import classification_report  Precision, recall, f1-score and support
>>> print(classification_report(y_test, y_pred))
Confusion Matrix
>>> from sklearn.metrics import confusion_matrix
>>> print(confusion_matrix(y_test, y_pred))
Regression Metrics
Mean Absolute Error
>>> from sklearn.metrics import mean_absolute_error
>>> y_true = [3, -0.5, 2]
>>> mean_absolute_error(y_true, y_pred)
Mean Squared Error
>>> from sklearn.metrics import mean_squared_error
>>> mean_squared_error(y_test, y_pred)
R² Score
>>> from sklearn.metrics import r2_score
>>> r2_score(y_true, y_pred)
Clustering Metrics
Adjusted Rand Index
>>> from sklearn.metrics import adjusted_rand_score
>>> adjusted_rand_score(y_true, y_pred)
Homogeneity
>>> from sklearn.metrics import homogeneity_score
>>> homogeneity_score(y_true, y_pred)
V-measure
>>> from sklearn.metrics import v_measure_score
>>> v_measure_score(y_true, y_pred)
Cross-Validation
>>> from sklearn.model_selection import cross_val_score
>>> print(cross_val_score(knn, X_train, y_train, cv=4))
>>> print(cross_val_score(lr, X, y, cv=2))

Tune Your Model
Grid Search
>>> from sklearn.model_selection import GridSearchCV
>>> params = {"n_neighbors": np.arange(1,3),
              "metric": ["euclidean", "cityblock"]}
>>> grid = GridSearchCV(estimator=knn,
                        param_grid=params)
>>> grid.fit(X_train, y_train)
>>> print(grid.best_score_)
>>> print(grid.best_estimator_.n_neighbors)
Randomized Parameter Optimization
>>> from sklearn.model_selection import RandomizedSearchCV
>>> params = {"n_neighbors": range(1,5),
              "weights": ["uniform", "distance"]}
>>> rsearch = RandomizedSearchCV(estimator=knn,
                                 param_distributions=params,
                                 cv=4,
                                 n_iter=8,
                                 random_state=5)
>>> rsearch.fit(X_train, y_train)
>>> print(rsearch.best_score_)

DataCamp
Learn Python for Data Science Interactively
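Scikit-learn's appeal is the unified fit/predict interface mentioned at the top of this sheet. A minimal sketch (assuming scikit-learn >= 1.0; not part of the original sheet) chaining the same scaling and KNN steps with a Pipeline:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)

# The pipeline itself is an estimator: one fit(), one score()
pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))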
Python For Data Science Cheat Sheet
Matplotlib
Learn Python Interactively at www.DataCamp.com

Matplotlib
Matplotlib is a Python 2D plotting library which produces publication-quality figures in a variety of
hardcopy formats and interactive environments across platforms.

Plot Anatomy & Workflow
Plot Anatomy
The main parts of a plot are the Figure, the Axes/Subplot, and the X- and Y-axis.
Workflow
The basic steps to creating plots with matplotlib are:
1 Prepare data  2 Create plot  3 Plot  4 Customize plot  5 Save plot  6 Show plot
>>> import matplotlib.pyplot as plt
>>> x = [1,2,3,4]                                    Step 1
>>> y = [10,20,25,30]
>>> fig = plt.figure()                               Step 2
>>> ax = fig.add_subplot(111)                        Step 3
>>> ax.plot(x, y, color='lightblue', linewidth=3)    Step 3, 4
>>> ax.scatter([2,4,6],
               [5,15,25],
               color='darkgreen',
               marker='^')
>>> ax.set_xlim(1, 6.5)
>>> plt.savefig('foo.png')                           Step 5
>>> plt.show()                                       Step 6

1 Prepare The Data                               Also see Lists & NumPy
1D Data
>>> import numpy as np
>>> x = np.linspace(0, 10, 100)
>>> y = np.cos(x)
>>> z = np.sin(x)
2D Data or Images
>>> data = 2 * np.random.random((10, 10))
>>> data2 = 3 * np.random.random((10, 10))
>>> Y, X = np.mgrid[-3:3:100j, -3:3:100j]
>>> U = -1 - X**2 + Y
>>> V = 1 + X - Y**2
>>> from matplotlib.cbook import get_sample_data
>>> img = np.load(get_sample_data('axes_grid/bivariate_normal.npy'))

2 Create Plot
>>> import matplotlib.pyplot as plt
Figure
>>> fig = plt.figure()
>>> fig2 = plt.figure(figsize=plt.figaspect(2.0))
Axes
All plotting is done with respect to an Axes. In most cases, a subplot will fit your needs.
A subplot is an axes on a grid system.
>>> fig.add_axes()
>>> ax1 = fig.add_subplot(221)  # row-col-num
>>> ax3 = fig.add_subplot(212)
>>> fig3, axes = plt.subplots(nrows=2,ncols=2)
>>> fig4, axes2 = plt.subplots(ncols=3)

3 Plotting Routines
1D Data
>>> fig, ax = plt.subplots()
>>> lines = ax.plot(x,y)                 Draw points with lines or markers connecting them
>>> ax.scatter(x,y)                      Draw unconnected points, scaled or colored
>>> axes[0,0].bar([1,2,3],[3,4,5])       Plot vertical rectangles (constant width)
>>> axes[1,0].barh([0.5,1,2.5],[0,1,2])  Plot horizontal rectangles (constant height)
>>> axes[1,1].axhline(0.45)              Draw a horizontal line across axes
>>> axes[0,1].axvline(0.65)              Draw a vertical line across axes
>>> ax.fill(x,y,color='blue')            Draw filled polygons
>>> ax.fill_between(x,y,color='yellow')  Fill between y-values and 0
2D Data or Images
>>> fig, ax = plt.subplots()
>>> im = ax.imshow(img,                  Colormapped or RGB arrays
                   cmap='gist_earth',
                   interpolation='nearest',
                   vmin=-2,
                   vmax=2)
>>> axes2[0].pcolor(data2)               Pseudocolor plot of 2D array
>>> axes2[0].pcolormesh(data)            Pseudocolor plot of 2D array
>>> CS = plt.contour(Y,X,U)              Plot contours
>>> axes2[2].contourf(data)              Plot filled contours
>>> ax.clabel(CS)                        Label a contour plot
Vector Fields
>>> axes[0,1].arrow(0,0,0.5,0.5)         Add an arrow to the axes
>>> axes[1,1].quiver(y,z)                Plot a 2D field of arrows
>>> axes[0,1].streamplot(X,Y,U,V)        Plot a 2D field of arrows
Data Distributions
>>> ax1.hist(y)                          Plot a histogram
>>> ax3.boxplot(y)                       Make a box and whisker plot
>>> ax3.violinplot(z)                    Make a violin plot

4 Customize Plot
Colors, Color Bars & Color Maps
>>> plt.plot(x, x, x, x**2, x, x**3)
>>> ax.plot(x, y, alpha = 0.4)
>>> ax.plot(x, y, c='k')
>>> fig.colorbar(im, orientation='horizontal')
>>> im = ax.imshow(img,
                   cmap='seismic')
Markers
>>> fig, ax = plt.subplots()
>>> ax.scatter(x,y,marker=".")
>>> ax.plot(x,y,marker="o")
Linestyles
>>> plt.plot(x,y,linewidth=4.0)
>>> plt.plot(x,y,ls='solid')
>>> plt.plot(x,y,ls='--')
>>> plt.plot(x,y,'--',x**2,y**2,'-.')
>>> plt.setp(lines,color='r',linewidth=4.0)
Text & Annotations
>>> ax.text(1,
            -2.1,
            'Example Graph',
            style='italic')
>>> ax.annotate("Sine",
                xy=(8, 0),
                xycoords='data',
                xytext=(10.5, 0),
                textcoords='data',
                arrowprops=dict(arrowstyle="->",
                                connectionstyle="arc3"),)
Mathtext
>>> plt.title(r'$\sigma_i=15$', fontsize=20)
Limits, Legends & Layouts
Limits & Autoscaling
>>> ax.margins(x=0.0,y=0.1)                Add padding to a plot
>>> ax.axis('equal')                       Set the aspect ratio of the plot to 1
>>> ax.set(xlim=[0,10.5],ylim=[-1.5,1.5])  Set limits for x- and y-axis
>>> ax.set_xlim(0,10.5)                    Set limits for x-axis
Legends
>>> ax.set(title='An Example Axes',        Set a title and x- and y-axis labels
           ylabel='Y-Axis',
           xlabel='X-Axis')
>>> ax.legend(loc='best')                  No overlapping plot elements
Ticks
>>> ax.xaxis.set(ticks=range(1,5),         Manually set x-ticks
                 ticklabels=[3,100,-12,"foo"])
>>> ax.tick_params(axis='y',               Make y-ticks longer and go in and out
                   direction='inout',
                   length=10)
Subplot Spacing
>>> fig3.subplots_adjust(wspace=0.5,       Adjust the spacing between subplots
                         hspace=0.3,
                         left=0.125,
                         right=0.9,
                         top=0.9,
                         bottom=0.1)
>>> fig.tight_layout()                     Fit subplot(s) in to the figure area
Axis Spines
>>> ax1.spines['top'].set_visible(False)   Make the top axis line for a plot invisible
>>> ax1.spines['bottom'].set_position(('outward',10))  Move the bottom axis line outward

5 Save Plot
Save figures
>>> plt.savefig('foo.png')
Save transparent figures
>>> plt.savefig('foo.png', transparent=True)

6 Show Plot
>>> plt.show()

Close & Clear
>>> plt.cla()    Clear an axis
>>> plt.clf()    Clear the entire figure
>>> plt.close()  Close a window

DataCamp
Learn Python for Data Science Interactively
Matplotlib 2.0.0 - Updated on: 02/2017
Python For Data Science Cheat Sheet
Seaborn
Learn Data Science Interactively at www.DataCamp.com

Statistical Data Visualization With Seaborn
The Python visualization library Seaborn is based on matplotlib and provides a high-level interface for
drawing attractive statistical graphics.

Make use of the following aliases to import the libraries:
>>> import matplotlib.pyplot as plt
>>> import seaborn as sns

The basic steps to creating plots with Seaborn are:
1. Prepare some data
2. Control figure aesthetics
3. Plot with Seaborn
4. Further customize your plot
5. Show your plot

>>> import matplotlib.pyplot as plt
>>> import seaborn as sns
>>> tips = sns.load_dataset("tips")                  Step 1
>>> sns.set_style("whitegrid")                       Step 2
>>> g = sns.lmplot(x="tip",                          Step 3
                   y="total_bill",
                   data=tips,
                   aspect=2)
>>> g = (g.set_axis_labels("Tip","Total bill(USD)").
           set(xlim=(0,10),ylim=(0,100)))
>>> plt.title("title")                               Step 4
>>> plt.show(g)                                      Step 5

1 Data                                    Also see Lists, NumPy & Pandas
>>> import pandas as pd
>>> import numpy as np
>>> uniform_data = np.random.rand(10, 12)
>>> data = pd.DataFrame({'x':np.arange(1,101),
                         'y':np.random.normal(0,4,100)})
Seaborn also offers built-in data sets:
>>> titanic = sns.load_dataset("titanic")
>>> iris = sns.load_dataset("iris")

2 Figure Aesthetics                       Also see Matplotlib
>>> f, ax = plt.subplots(figsize=(5,6))   Create a figure and one subplot
Seaborn styles
>>> sns.set()                             (Re)set the seaborn default
>>> sns.set_style("whitegrid")            Set the matplotlib parameters
>>> sns.set_style("ticks",                Set the matplotlib parameters
                  {"xtick.major.size":8,
                   "ytick.major.size":8})
>>> sns.axes_style("whitegrid")           Return a dict of params or use with
                                          with to temporarily set the style
Context Functions
>>> sns.set_context("talk")               Set context to "talk"
>>> sns.set_context("notebook",           Set context to "notebook",
                    font_scale=1.5,       scale font elements and
                    rc={"lines.linewidth":2.5})  override param mapping
Color Palette
>>> sns.set_palette("husl",3)             Define the color palette
>>> sns.color_palette("husl")             Use with with to temporarily set palette
>>> flatui = ["#9b59b6","#3498db","#95a5a6","#e74c3c","#34495e","#2ecc71"]
>>> sns.set_palette(flatui)               Set your own color palette

3 Plotting With Seaborn
Axis Grids
>>> g = sns.FacetGrid(titanic,            Subplot grid for plotting conditional relationships
                      col="survived",
                      row="sex")
>>> g = g.map(plt.hist,"age")
>>> sns.factorplot(x="pclass",            Draw a categorical plot onto a Facetgrid
                   y="survived",
                   hue="sex",
                   data=titanic)
>>> sns.lmplot(x="sepal_width",           Plot data and regression model fits across a FacetGrid
               y="sepal_length",
               hue="species",
               data=iris)
>>> h = sns.PairGrid(iris)                Subplot grid for plotting pairwise relationships
>>> h = h.map(plt.scatter)
>>> sns.pairplot(iris)                    Plot pairwise bivariate distributions
>>> i = sns.JointGrid(x="x",              Grid for bivariate plot with marginal univariate plots
                      y="y",
                      data=data)
>>> i = i.plot(sns.regplot,
               sns.distplot)
>>> sns.jointplot("sepal_length",         Plot bivariate distribution
                  "sepal_width",
                  data=iris,
                  kind='kde')
Categorical Plots
Scatterplot
>>> sns.stripplot(x="species",            Scatterplot with one categorical variable
                  y="petal_length",
                  data=iris)
>>> sns.swarmplot(x="species",            Categorical scatterplot with non-overlapping points
                  y="petal_length",
                  data=iris)
Bar Chart
>>> sns.barplot(x="sex",                  Show point estimates and confidence intervals
                y="survived",             with scatterplot glyphs
                hue="class",
                data=titanic)
Count Plot
>>> sns.countplot(x="deck",               Show count of observations
                  data=titanic,
                  palette="Greens_d")
Point Plot
>>> sns.pointplot(x="class",              Show point estimates and confidence intervals
                  y="survived",           as rectangular bars
                  hue="sex",
                  data=titanic,
                  palette={"male":"g",
                           "female":"m"},
                  markers=["^","o"],
                  linestyles=["-","--"])
Boxplot
>>> sns.boxplot(x="alive",                Boxplot
                y="age",
                hue="adult_male",
                data=titanic)
>>> sns.boxplot(data=iris,orient="h")     Boxplot with wide-form data
Violinplot
>>> sns.violinplot(x="age",               Violin plot
                   y="sex",
                   hue="survived",
                   data=titanic)
Regression Plots
>>> sns.regplot(x="sepal_width",          Plot data and a linear regression model fit
                y="sepal_length",
                data=iris,
                ax=ax)
Distribution Plots
>>> plot = sns.distplot(data.y,           Plot univariate distribution
                        kde=False,
                        color="b")
Matrix Plots
>>> sns.heatmap(uniform_data,vmin=0,vmax=1)  Heatmap

4 Further Customizations                  Also see Matplotlib
Axisgrid Objects
>>> g.despine(left=True)                  Remove left spine
>>> g.set_ylabels("Survived")             Set the labels of the y-axis
>>> g.set_xticklabels(rotation=45)        Set the tick labels for x
>>> g.set_axis_labels("Survived",         Set the axis labels
                      "Sex")
>>> h.set(xlim=(0,5),                     Set the limit and ticks of the x- and y-axis
          ylim=(0,5),
          xticks=[0,2.5,5],
          yticks=[0,2.5,5])
Plot
>>> plt.title("A Title")                  Add plot title
>>> plt.ylabel("Survived")                Adjust the label of the y-axis
>>> plt.xlabel("Sex")                     Adjust the label of the x-axis
>>> plt.ylim(0,100)                       Adjust the limits of the y-axis
>>> plt.xlim(0,10)                        Adjust the limits of the x-axis
>>> plt.setp(ax,yticks=[0,5])             Adjust a plot property
>>> plt.tight_layout()                    Adjust subplot params

5 Show or Save Plot                       Also see Matplotlib
>>> plt.show()                            Show the plot
>>> plt.savefig("foo.png")                Save the plot as a figure
>>> plt.savefig("foo.png",                Save transparent figure
                transparent=True)

Close & Clear                             Also see Matplotlib
>>> plt.cla()                             Clear an axis
>>> plt.clf()                             Clear an entire figure
>>> plt.close()                           Close a window

DataCamp
Learn Python for Data Science Interactively
Python For Data Science Cheat Sheet
Bokeh
Learn Bokeh Interactively at www.DataCamp.com,
taught by Bryan Van de Ven, core contributor

Plotting With Bokeh
The Python interactive visualization library Bokeh enables high-performance visual presentation of
large datasets in modern web browsers.

Bokeh's mid-level general purpose bokeh.plotting interface is centered around two main components:
data and glyphs.

data + glyphs = plot

The basic steps to creating plots with the bokeh.plotting interface are:
1. Prepare some data: Python lists, NumPy arrays, Pandas DataFrames and other sequences of values
2. Create a new plot
3. Add renderers for your data, with visual customizations
4. Specify where to generate the output
5. Show or save the results

>>> from bokeh.plotting import figure
>>> from bokeh.io import output_file, show
>>> x = [1, 2, 3, 4, 5]                          Step 1
>>> y = [6, 7, 2, 4, 5]
>>> p = figure(title="simple line example",      Step 2
               x_axis_label='x',
               y_axis_label='y')
>>> p.line(x, y, legend="Temp.", line_width=2)   Step 3
>>> output_file("lines.html")                    Step 4
>>> show(p)                                      Step 5

1 Data                                   Also see Lists, NumPy & Pandas
Under the hood, your data is converted to Column Data Sources. You can also do this manually:
>>> import numpy as np
>>> import pandas as pd
>>> df = pd.DataFrame(np.array([[33.9, 4, 65, 'US'],
                                [32.4, 4, 66, 'Asia'],
                                [21.4, 4, 109, 'Europe']]),
                      columns=['mpg', 'cyl', 'hp', 'origin'],
                      index=['Toyota', 'Fiat', 'Volvo'])
>>> from bokeh.models import ColumnDataSource
>>> cds_df = ColumnDataSource(df)

2 Plotting
>>> from bokeh.plotting import figure
>>> p1 = figure(plot_width=300, tools='pan,box_zoom')
>>> p2 = figure(plot_width=300, plot_height=300,
                x_range=(0, 8), y_range=(0, 8))
>>> p3 = figure()

3 Renderers & Visual Customizations
Glyphs
Scatter Markers
>>> p1.circle(np.array([1,2,3]), np.array([3,2,1]),
              fill_color='white')
>>> p2.square(np.array([1.5,3.5,5.5]), [1,4,3],
              color='blue', size=1)
Line Glyphs
>>> p1.line([1,2,3,4], [3,4,5,6], line_width=2)
>>> p2.multi_line(pd.DataFrame([[1,2,3],[5,6,7]]),
                  pd.DataFrame([[3,4,5],[3,2,1]]),
                  color="blue")
Customized Glyphs                        Also see Data
Selection and Non-Selection Glyphs
>>> p = figure(tools='box_select')
>>> p.circle('mpg', 'cyl', source=cds_df,
             selection_color='red',
             nonselection_alpha=0.1)
Hover Glyphs
>>> from bokeh.models import HoverTool
>>> hover = HoverTool(tooltips=None, mode='vline')
>>> p3.add_tools(hover)
Colormapping
>>> from bokeh.models import CategoricalColorMapper
>>> color_mapper = CategoricalColorMapper(
        factors=['US', 'Asia', 'Europe'],
        palette=['blue', 'red', 'green'])
>>> p3.circle('mpg', 'cyl', source=cds_df,
              color=dict(field='origin',
                         transform=color_mapper),
              legend='Origin')
Legend Location
Inside Plot Area
>>> p.legend.location = 'bottom_left'
Outside Plot Area
>>> from bokeh.models import Legend
>>> r1 = p2.asterisk(np.array([1,2,3]), np.array([3,2,1]))
>>> r2 = p2.line([1,2,3,4], [3,4,5,6])
>>> legend = Legend(items=[("One", [p1, r1]), ("Two", [r2])],
                    location=(0, -30))
>>> p.add_layout(legend, 'right')
Legend Orientation
>>> p.legend.orientation = "horizontal"
>>> p.legend.orientation = "vertical"
Legend Background & Border
>>> p.legend.border_line_color = "navy"
>>> p.legend.background_fill_color = "white"
Rows & Columns Layout
Rows
>>> from bokeh.layouts import row
>>> layout = row(p1,p2,p3)
Columns
>>> from bokeh.layouts import column
>>> layout = column(p1,p2,p3)
Nesting Rows & Columns
>>> layout = row(column(p1,p2), p3)
Grid Layout
>>> from bokeh.layouts import gridplot
>>> row1 = [p1,p2]
>>> row2 = [p3]
>>> layout = gridplot([[p1,p2],[p3]])
Tabbed Layout
>>> from bokeh.models.widgets import Panel, Tabs
>>> tab1 = Panel(child=p1, title="tab1")
>>> tab2 = Panel(child=p2, title="tab2")
>>> layout = Tabs(tabs=[tab1, tab2])
Linked Plots
Linked Axes
>>> p2.x_range = p1.x_range
>>> p2.y_range = p1.y_range
Linked Brushing
>>> p4 = figure(plot_width=100,
                tools='box_select,lasso_select')
>>> p4.circle('mpg', 'cyl', source=cds_df)
>>> p5 = figure(plot_width=200,
                tools='box_select,lasso_select')
>>> p5.circle('mpg', 'hp', source=cds_df)
>>> layout = row(p4,p5)

4 Output & Export
Notebook
>>> from bokeh.io import output_notebook, show
>>> output_notebook()
HTML
Standalone HTML
>>> from bokeh.embed import file_html
>>> from bokeh.resources import CDN
>>> html = file_html(p, CDN, "my_plot")
>>> from bokeh.io import output_file, show
>>> output_file('my_bar_chart.html', mode='cdn')
Components
>>> from bokeh.embed import components
>>> script, div = components(p)
PNG
>>> from bokeh.io import export_png
>>> export_png(p, filename="plot.png")
SVG
>>> from bokeh.io import export_svgs
>>> p.output_backend = "svg"
>>> export_svgs(p, filename="plot.svg")

5 Show or Save Your Plots
>>> show(p1)        >>> show(layout)
>>> save(p1)        >>> save(layout)

DataCamp
Learn Python for Data Science Interactively
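To tie the Bokeh snippets together, here is a minimal runnable sketch of linked axes in a grid layout. It assumes the Bokeh 1.x/2.x API this sheet targets; newer Bokeh renames plot_width/plot_height to width/height:

from bokeh.io import output_file, show
from bokeh.layouts import gridplot
from bokeh.plotting import figure

x = [1, 2, 3, 4, 5]

p1 = figure(plot_width=300, plot_height=300, title="line")
p1.line(x, [6, 7, 2, 4, 5], line_width=2)

# Sharing ranges is what links the axes: pan/zoom in one plot moves the other
p2 = figure(plot_width=300, plot_height=300, title="circles",
            x_range=p1.x_range, y_range=p1.y_range)
p2.circle(x, [2, 5, 8, 2, 7], size=10)

output_file("linked.html")
show(gridplot([[p1, p2]]))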
Regular Expressions Cheat Sheet
Learn regular expressions online at www.DataCamp.com

> What is a regular expression?

A regular expression (regex or regexp) is a pattern of characters that describes an amount of text. To process
regexes, you will use a "regex engine." Each of these engines uses slightly different syntax, called a regex
flavor. A list of popular engines can be found here. Two common programming languages we discuss on
DataCamp are Python and R, which each have their own engines.

Since regex describes patterns of text, it can be used to check for the existence of patterns in a text,
extract substrings from longer strings, and help make adjustments to text. Regex can be very simple to
describe specific words, or it can be more advanced to find vague patterns of characters like the top-level
domain in a url.

> Definitions

Literal character: A literal character is the most basic regular expression you can use. It simply matches
the actual character you write. So if you are trying to represent an "r," you would write r.

Metacharacter: Metacharacters signify to the regex engine that the following character has a special
meaning. You typically include a \ in front of the metacharacter and they can do things like signify the
beginning of a line, the end of a line, or match any single character.

Character class: A character class (or character set) tells the engine to look for one of a list of characters.
It is signified by [ and ] with the characters you are looking for in the middle of the brackets.

Capture group: A capture group is signified by opening and closing round parentheses. They allow you to
group regexes together to apply other regex features like quantifiers (see below) to the group.

> Anchors

Anchors match a position before or after other characters.

Syntax   Description                                      Pattern    Matches                       Non-matches
^        match start of line                              ^r         rabbit, raccoon               parrot, ferret
$        match end of line                                t$         rabbit, foot                  trap, star
\A       match start of line                              \Ar        rabbit, raccoon               parrot, ferret
\Z       match end of line                                t\Z        rabbit, foot                  trap, star
\b       match characters at the start or end of a word   \bfox\b    the red fox ran, the fox ate  foxtrot, foxskin scarf
\B       match characters in the middle of other          \Bee\B     trees, beef                   bee, tree
         non-space characters

> Matching types of character

Rather than matching specific characters, you can match specific types of characters such as letters,
numbers, and more.

Syntax          Description                    Pattern    Matches                       Non-matches
.               anything except a linebreak    c.e        clean, cheap                  acert, cent
\d              match a digit                  \d         6060-842                      two
\D              match a non-digit              \D         The 5 cats ate, 12 Angry men  52, 10032
\w              match word characters          \wee\w     trees, bee4                   The bee, eels eat meat
\W              match non-word characters      \Wbat\W    At bat, Swing the bat fast    wombat, bat53
\s              match whitespace               \sfox\s    the fox ate, his fox ran      it's the fox., foxfur
\S              match non-whitespace           \See\S     trees, beef                   the bee stung, The tall tree
\metacharacter  escape a metacharacter to      \.         The cat ate.                  the cat ate
                match on the metacharacter     \^         2^3                           23

> Character classes

Character classes are sets or ranges of characters.

Syntax   Description                          Pattern          Matches        Non-matches
[xy]     match several characters             gr[ea]y          gray, grey     green, greek
[x-y]    match a range of characters          [a-e]            amber, brand   fox, join
[^xy]    does not match several characters    gr[^ea]y         gr0y, gr-y     gray, grey
[\^-]    match metacharacters inside the      4[\^\.\-+*/]\d   4^3, 4.2       44, 423
         character class

> Repetition

Rather than matching single instances of characters, you can match repeated characters.

Syntax          Description                      Pattern    Matches            Non-matches
x*              match zero or more times         ar*o       cacao, carrot      arugula, artichoke
x+              match one or more times          re+        green, tree        trap, ruined
x?              match zero or one times          ro?a       roast, rant        root, rear
x{m}            match m times                    \we{2}\w   deer, seer         red, enter
x{m,}           match m or more times            2{3,}4     671-2224, 2222224  224, 123
x{m,n}          match between m and n times      12{1,3}3   1234, 1222384      15335, 1222223
x*?, x+?, etc.  match the minimum number of      re+?       tree, freeeee      trout, roasted
                times -- known as a lazy
                quantifier

> Capturing, alternation & backreferences

In order to extract specific parts of a string, you can capture those parts, and even name the parts that
you captured.

Syntax      Description                        Pattern                       Matches                   Non-matches
(x)         capturing a pattern                (iss)+                        Mississippi, missed       mist, persist
(?:x)       create a group without capturing   (?:ab)(cd)                    Match: abcd; Group 1: cd  acbd
(?<name>x)  create a named capture group       (?<first>\d)(?<second>\d)\d*  Match: 1325; first: 1;    2, hello
                                                                             second: 3
(x|y)       match several alternative          (re|ba)                       red, banter               rant, bear
            patterns
\n          reference previous captures,       (b)(\w*)\1                    blob, bribe               bear, bring
            where n is the group index
            starting at 1
\k<name>    reference named captures           (?<first>5)(\d*)\k<first>     51245, 55                 523, 51

> Lookahead

You can specify that specific characters must appear before or after you match, without including those
characters in the match.

Syntax   Description                           Pattern      Matches           Non-matches
(?=x)    looks ahead at the next characters    an(?=an)     banana            band
         without using them in the match       iss(?=ipp)   Mississippi       missed
(?!x)    looks ahead at next characters to     ai(?!n)      fail, brail       faint, train
         not match on
(?<=x)   looks at previous characters for a    (?<=tr)a     trail, translate  bear, streak
         match without using those in the
         match
(?<!x)   looks at previous characters to       (?<!tr)a     bear, translate   trail, strained
         not match on

> Literal matches and modifiers

Modifiers are settings that change the way the matching rules work.

Syntax       Description                             Pattern                   Matches              Non-matches
\Qx\E        match start to finish                   \Qtell\E                  tell                 I'll tell you this
                                                     \Q\d\E                    \d                   I have 5 coins
(?i)x(?-i)   set the regex string to                 (?i)te(?-i)               sTep, tEach          Trench, bear
             case-insensitive
(?x)x(?-x)   regex ignores whitespace                (?x)t a p(?-x)            tap, tapdance        c a t, rot a potato
(?s)x(?-s)   turns on single-line/DOTALL mode,       (?s)first.second(?-s)     first\nsecond (the   firstsecond
             which makes the "." include new-line                             "." spans the
             symbols (\n) in addition to                                      newline)
             everything else
(?m)x(?-m)   changes ^ and $ to match at the start   (?m)^eat and sleep$(?-m)  eat and sleep (as    treat and sleep,
             or end of a line rather than the                                  its own line)        eat and sleep.
             start or end of the string

> Unicode

Regular expressions can work beyond the Roman alphabet, with things like Chinese characters or emoji.

Code points: The hexadecimal number used to represent an abstract character in a system like Unicode.
Graphemes: Either a codepoint or a character. All characters are made up of one or more graphemes in
a sequence.

Syntax   Description        Example
\X       match graphemes    a single \X matches è whether it is encoded as the one code point \u00e8
                            or as e plus a combining accent (\u0065\u0300)

Learn Data Skills Online at www.DataCamp.com
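A few of the table rows above, run through Python's own engine (the re module). Note that Python spells named groups (?P<name>...) rather than (?<name>...), and \X requires the third-party regex package:

import re

# Character class and quantifier rows from the tables above
print(re.findall(r"gr[ea]y", "gray grey green"))  # ['gray', 'grey']
print(re.findall(r"re+", "green tree trap"))      # ['ree', 'ree']

# Named capture groups (Python syntax)
m = re.search(r"(?P<first>\d)(?P<second>\d)\d*", "1325")
print(m.group("first"), m.group("second"))        # 1 3

# Lookahead: match 'an' only when followed by another 'an'
print(re.findall(r"an(?=an)", "banana"))          # ['an']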
Python For Data Science
SciPy Cheat Sheet
Learn SciPy online at www.DataCamp.com

SciPy
The SciPy library is one of the core packages for scientific computing that provides mathematical
algorithms and convenience functions built on the NumPy extension of Python.

> Interacting With NumPy                               Also see NumPy

>>> import numpy as np
>>> a = np.array([1,2,3])
>>> b = np.array([(1+5j,2j,3j), (4j,5j,6j)])
>>> c = np.array([[(1.5,2,3), (4,5,6)], [(3,2,1), (4,5,6)]])

Index Tricks
>>> np.mgrid[0:5,0:5] #Create a dense meshgrid
>>> np.ogrid[0:2,0:2] #Create an open meshgrid
>>> np.r_[3,[0]*5,-1:1:10j] #Stack arrays vertically (row-wise)
>>> np.c_[b,c] #Create stacked column-wise arrays

Shape Manipulation
>>> np.transpose(b) #Permute array dimensions
>>> b.flatten() #Flatten the array
>>> np.hstack((b,c)) #Stack arrays horizontally (column-wise)
>>> np.vstack((a,b)) #Stack arrays vertically (row-wise)
>>> np.hsplit(c,2) #Split the array horizontally at the 2nd index
>>> np.vsplit(d,2) #Split the array vertically at the 2nd index

Polynomials
>>> from numpy import poly1d
>>> p = poly1d([3,4,5]) #Create a polynomial object

Vectorizing Functions
>>> def myfunc(a):
...     if a < 0:
...         return a*2
...     else:
...         return a/2
>>> np.vectorize(myfunc) #Vectorize functions

Type Handling
>>> np.real(c) #Return the real part of the array elements
>>> np.imag(c) #Return the imaginary part of the array elements
>>> np.real_if_close(c,tol=1000) #Return a real array if complex parts close to 0
>>> np.cast['f'](np.pi) #Cast object to a data type

Other Useful Functions
>>> np.angle(b,deg=True) #Return the angle of the complex argument
>>> g = np.linspace(0,np.pi,num=5) #Create an array of evenly spaced values (number of samples)
>>> g[3:] += np.pi
>>> np.unwrap(g) #Unwrap
>>> np.logspace(0,10,3) #Create an array of evenly spaced values (log scale)
>>> np.select([c<4],[c*2]) #Return values from a list of arrays depending on conditions
>>> misc.factorial(a) #Factorial
>>> misc.comb(10,3,exact=True) #Combinations of N things taken k at a time
>>> misc.central_diff_weights(3) #Weights for Np-point central derivative
>>> misc.derivative(myfunc,1.0) #Find the n-th derivative of a function at a point

> Linear Algebra                                       Also see NumPy

You'll use the linalg and sparse modules.
Note that scipy.linalg contains and expands on numpy.linalg.

>>> from scipy import linalg, sparse

Creating Matrices
>>> A = np.matrix(np.random.random((2,2)))
>>> B = np.asmatrix(b)
>>> C = np.mat(np.random.random((10,5)))
>>> D = np.mat([[3,4], [5,6]])

Basic Matrix Routines
Inverse
>>> A.I #Inverse
>>> linalg.inv(A) #Inverse
>>> A.T #Transpose matrix
>>> A.H #Conjugate transposition
>>> np.trace(A) #Trace
Norm
>>> linalg.norm(A) #Frobenius norm
>>> linalg.norm(A,1) #L1 norm (max column sum)
>>> linalg.norm(A,np.inf) #L inf norm (max row sum)
Rank
>>> np.linalg.matrix_rank(C) #Matrix rank
Determinant
>>> linalg.det(A) #Determinant
Solving linear problems
>>> linalg.solve(A,b) #Solver for dense matrices
>>> E = np.mat(a).T #Create a column matrix from a
>>> linalg.lstsq(D,E) #Least-squares solution to linear matrix equation
Generalized inverse
>>> linalg.pinv(C) #Compute the pseudo-inverse of a matrix (least-squares solver)
>>> linalg.pinv2(C) #Compute the pseudo-inverse of a matrix (SVD)

Creating Sparse Matrices
>>> F = np.eye(3, k=1) #3x3 array with ones on the first superdiagonal
>>> G = np.mat(np.identity(2)) #Create a 2x2 identity matrix
>>> C[C > 0.5] = 0
>>> H = sparse.csr_matrix(C) #Compressed Sparse Row matrix
>>> I = sparse.csc_matrix(D) #Compressed Sparse Column matrix
>>> J = sparse.dok_matrix(A) #Dictionary Of Keys matrix
>>> E.todense() #Sparse matrix to full matrix
>>> sparse.isspmatrix_csc(A) #Identify sparse matrix

Sparse Matrix Routines
Inverse
>>> sparse.linalg.inv(I) #Inverse
Norm
>>> sparse.linalg.norm(I) #Norm
Solving linear problems
>>> sparse.linalg.spsolve(H,I) #Solver for sparse matrices

Sparse Matrix Functions
>>> sparse.linalg.expm(I) #Sparse matrix exponential

Matrix Functions
Addition
>>> np.add(A,D) #Addition
Subtraction
>>> np.subtract(A,D) #Subtraction
Division
>>> np.divide(A,D) #Division
Multiplication
>>> np.multiply(D,A) #Multiplication
>>> np.dot(A,D) #Dot product
>>> np.vdot(A,D) #Vector dot product
>>> np.inner(A,D) #Inner product
>>> np.outer(A,D) #Outer product
>>> np.tensordot(A,D) #Tensor dot product
>>> np.kron(A,D) #Kronecker product
Exponential Functions
>>> linalg.expm(A) #Matrix exponential
>>> linalg.expm2(A) #Matrix exponential (Taylor Series)
>>> linalg.expm3(D) #Matrix exponential (eigenvalue decomposition)
Logarithm Function
>>> linalg.logm(A) #Matrix logarithm
Trigonometric Functions
>>> linalg.sinm(D) #Matrix sine
>>> linalg.cosm(D) #Matrix cosine
>>> linalg.tanm(A) #Matrix tangent
Hyperbolic Trigonometric Functions
>>> linalg.sinhm(D) #Hyperbolic matrix sine
>>> linalg.coshm(D) #Hyperbolic matrix cosine
>>> linalg.tanhm(A) #Hyperbolic matrix tangent
Matrix Sign Function
>>> linalg.signm(A) #Matrix sign function
Matrix Square Root
>>> linalg.sqrtm(A) #Matrix square root
Arbitrary Functions
>>> linalg.funm(A, lambda x: x*x) #Evaluate matrix function

Decompositions
Eigenvalues and Eigenvectors
>>> la, v = linalg.eig(A) #Solve ordinary or generalized eigenvalue problem for square matrix
>>> l1, l2 = la #Unpack eigenvalues
>>> v[:,0] #First eigenvector
>>> v[:,1] #Second eigenvector
>>> linalg.eigvals(A) #Compute eigenvalues
Singular Value Decomposition
>>> U,s,Vh = linalg.svd(B) #Singular Value Decomposition (SVD)
>>> M,N = B.shape
>>> Sig = linalg.diagsvd(s,M,N) #Construct sigma matrix in SVD
LU Decomposition
>>> P,L,U = linalg.lu(C) #LU Decomposition

Sparse Matrix Decompositions
>>> la, v = sparse.linalg.eigs(F,1) #Eigenvalues and eigenvectors
>>> sparse.linalg.svds(H, 2) #SVD

> Asking For Help
>>> help(scipy.linalg.diagsvd)
>>> np.info(np.matrix)

Learn Data Skills Online at www.DataCamp.com
Python For Data Science
Seaborn Cheat Sheet
Learn Seaborn online at www.DataCamp.com

Statistical Data Visualization With Seaborn
The Python visualization library Seaborn is based on matplotlib and provides a high-level interface for
drawing attractive statistical graphics.

Make use of the following aliases to import the libraries:
>>> import matplotlib.pyplot as plt
>>> import seaborn as sns

The basic steps to creating plots with Seaborn are:
1. Prepare some data
2. Control figure aesthetics
3. Plot with Seaborn
4. Further customize your plot
5. Show your plot

>>> import matplotlib.pyplot as plt
>>> import seaborn as sns
>>> tips = sns.load_dataset("tips") #Step 1
>>> sns.set_style("whitegrid") #Step 2
>>> g = sns.lmplot(x="tip", #Step 3
                   y="total_bill",
                   data=tips,
                   aspect=2)
>>> g = (g.set_axis_labels("Tip","Total bill(USD)").
           set(xlim=(0,10),ylim=(0,100)))
>>> plt.title("title") #Step 4
>>> plt.show(g) #Step 5

1 Data                                    Also see Lists, NumPy & Pandas
>>> import pandas as pd
>>> import numpy as np
>>> uniform_data = np.random.rand(10, 12)
>>> data = pd.DataFrame({'x':np.arange(1,101),
                         'y':np.random.normal(0,4,100)})
Seaborn also offers built-in data sets:
>>> titanic = sns.load_dataset("titanic")
>>> iris = sns.load_dataset("iris")

2 Figure Aesthetics                       Also see Matplotlib
>>> f, ax = plt.subplots(figsize=(5,6)) #Create a figure and one subplot
Seaborn styles
>>> sns.set() #(Re)set the seaborn default
>>> sns.set_style("whitegrid") #Set the matplotlib parameters
>>> sns.set_style("ticks", #Set the matplotlib parameters
                  {"xtick.major.size":8,
                   "ytick.major.size":8})
>>> sns.axes_style("whitegrid") #Return a dict of params or use with with to temporarily set the style
Context Functions
>>> sns.set_context("talk") #Set context to "talk"
>>> sns.set_context("notebook", #Set context to "notebook",
                    font_scale=1.5, #Scale font elements and
                    rc={"lines.linewidth":2.5}) #override param mapping
Color Palette
>>> sns.set_palette("husl",3) #Define the color palette
>>> sns.color_palette("husl") #Use with with to temporarily set palette
>>> flatui = ["#9b59b6","#3498db","#95a5a6","#e74c3c","#34495e","#2ecc71"]
>>> sns.set_palette(flatui) #Set your own color palette

3 Plotting With Seaborn
Axis Grids
>>> g = sns.FacetGrid(titanic, #Subplot grid for plotting conditional relationships
                      col="survived",
                      row="sex")
>>> g = g.map(plt.hist,"age")
>>> sns.factorplot(x="pclass", #Draw a categorical plot onto a Facetgrid
                   y="survived",
                   hue="sex",
                   data=titanic)
>>> sns.lmplot(x="sepal_width", #Plot data and regression model fits across a FacetGrid
               y="sepal_length",
               hue="species",
               data=iris)
>>> h = sns.PairGrid(iris) #Subplot grid for plotting pairwise relationships
>>> h = h.map(plt.scatter)
>>> sns.pairplot(iris) #Plot pairwise bivariate distributions
>>> i = sns.JointGrid(x="x", #Grid for bivariate plot with marginal univariate plots
                      y="y",
                      data=data)
>>> i = i.plot(sns.regplot,
               sns.distplot)
>>> sns.jointplot("sepal_length", #Plot bivariate distribution
                  "sepal_width",
                  data=iris,
                  kind='kde')
Categorical Plots
Scatterplot
>>> sns.stripplot(x="species", #Scatterplot with one categorical variable
                  y="petal_length",
                  data=iris)
>>> sns.swarmplot(x="species", #Categorical scatterplot with non-overlapping points
                  y="petal_length",
                  data=iris)
Bar Chart
>>> sns.barplot(x="sex", #Show point estimates & confidence intervals with scatterplot glyphs
                y="survived",
                hue="class",
                data=titanic)
Count Plot
>>> sns.countplot(x="deck", #Show count of observations
                  data=titanic,
                  palette="Greens_d")
Point Plot
>>> sns.pointplot(x="class", #Show point estimates & confidence intervals as rectangular bars
                  y="survived",
                  hue="sex",
                  data=titanic,
                  palette={"male":"g",
                           "female":"m"},
                  markers=["^","o"],
                  linestyles=["-","--"])
Boxplot
>>> sns.boxplot(x="alive", #Boxplot
                y="age",
                hue="adult_male",
                data=titanic)
>>> sns.boxplot(data=iris,orient="h") #Boxplot with wide-form data
Violinplot
>>> sns.violinplot(x="age", #Violin plot
                   y="sex",
                   hue="survived",
                   data=titanic)
Regression Plots
>>> sns.regplot(x="sepal_width", #Plot data and a linear regression model fit
                y="sepal_length",
                data=iris,
                ax=ax)
Distribution Plots
>>> plot = sns.distplot(data.y, #Plot univariate distribution
                        kde=False,
                        color="b")
Matrix Plots
>>> sns.heatmap(uniform_data,vmin=0,vmax=1) #Heatmap

4 Further Customizations                  Also see Matplotlib
Axisgrid Objects
>>> g.despine(left=True) #Remove left spine
>>> g.set_ylabels("Survived") #Set the labels of the y-axis
>>> g.set_xticklabels(rotation=45) #Set the tick labels for x
>>> g.set_axis_labels("Survived", #Set the axis labels
                      "Sex")
>>> h.set(xlim=(0,5), #Set the limit and ticks of the x- and y-axis
          ylim=(0,5),
          xticks=[0,2.5,5],
          yticks=[0,2.5,5])
Plot
>>> plt.title("A Title") #Add plot title
>>> plt.ylabel("Survived") #Adjust the label of the y-axis
>>> plt.xlabel("Sex") #Adjust the label of the x-axis
>>> plt.ylim(0,100) #Adjust the limits of the y-axis
>>> plt.xlim(0,10) #Adjust the limits of the x-axis
>>> plt.setp(ax,yticks=[0,5]) #Adjust a plot property
>>> plt.tight_layout() #Adjust subplot params

5 Show or Save Plot                       Also see Matplotlib
>>> plt.show() #Show the plot
>>> plt.savefig("foo.png") #Save the plot as a figure
>>> plt.savefig("foo.png", #Save transparent figure
                transparent=True)

> Close & Clear                           Also see Matplotlib
>>> plt.cla() #Clear an axis
>>> plt.clf() #Clear an entire figure
>>> plt.close() #Close a window

Learn Data Skills Online at www.DataCamp.com
Working with text data in Python
Learn Python online at www.DataCamp.com

> Example data used throughout this cheat sheet

Throughout this cheat sheet, we'll be using two pandas series named suits and rock_paper_scissors.

import pandas as pd
suits = pd.Series(["clubs", "Diamonds", "hearts", "Spades"])
rock_paper_scissors = pd.Series(["rock ", " paper", "scissors"])

> Formatting settings

# Generate an example DataFrame named df
df = pd.DataFrame({"x": [0.123, 4.567, 8.901]})
#        x
# 0  0.123
# 1  4.567
# 2  8.901

# Visualize and format table output (the output of style.format is an HTML table)
df.style.format(precision=1)
#      x
# 0  0.1
# 1  4.5
# 2  8.9

> String lengths and substrings

# Get the number of characters with .str.len()
suits.str.len()  # Returns 5 8 6 6

# Get substrings by position with .str[]
suits.str[2:5]  # Returns "ubs" "amo" "art" "ade"

# Get substrings by negative position with .str[]
suits.str[:-3]  # "cl" "Diamo" "hea" "Spa"

# Remove whitespace from the start/end with .str.strip()
rock_paper_scissors.str.strip()  # "rock" "paper" "scissors"

# Pad strings to a given length with .str.pad()
suits.str.pad(8, fillchar="_")  # "___clubs" "Diamonds" "__hearts" "__Spades"

> Changing case

# Convert to lowercase with .str.lower()
suits.str.lower()  # "clubs" "diamonds" "hearts" "spades"

# Convert to uppercase with .str.upper()
suits.str.upper()  # "CLUBS" "DIAMONDS" "HEARTS" "SPADES"

# Convert to title case with .str.title()
pd.Series("hello, world!").str.title()  # "Hello, World!"

# Convert to sentence case with .str.capitalize()
pd.Series("hello, world!").str.capitalize()  # "Hello, world!"

> Splitting strings

# Split strings into list of characters with .str.split(pat="")
suits.str.split(pat="")
# ["", "c", "l", "u", "b", "s", ""]
# ["", "D", "i", "a", "m", "o", "n", "d", "s", ""]
# ["", "h", "e", "a", "r", "t", "s", ""]
# ["", "S", "p", "a", "d", "e", "s", ""]

# Split strings by a separator with .str.split()
suits.str.split(pat="a")
# ["clubs"]
# ["Di", "monds"]
# ["he", "rts"]
# ["Sp", "des"]

# Split strings and return DataFrame with .str.split(expand=True)
suits.str.split(pat="a", expand=True)
#        0      1
# 0  clubs   None
# 1     Di  monds
# 2     he    rts
# 3     Sp    des

> Joining or concatenating strings

# Combine two strings with +
suits + "5"  # "clubs5" "Diamonds5" "hearts5" "Spades5"

# Collapse character vector to string with .str.cat()
suits.str.cat(sep=", ")  # "clubs, Diamonds, hearts, Spades"

# Duplicate and concatenate strings with *
suits * 2  # "clubsclubs" "DiamondsDiamonds" "heartshearts" "SpadesSpades"

> Detecting Matches

# Detect if a regex pattern is present in strings with .str.contains()
suits.str.contains("[ae]")  # False True True True

# Count the number of matches with .str.count()
suits.str.count("[ae]")  # 0 1 2 2

# Locate the position of substrings with .str.find()
suits.str.find("e")  # -1 -1 1 4

> Extracting matches

# Extract matches from strings with .str.findall()
suits.str.findall(".[ae]")  # [] ["ia"] ["he"] ["pa", "de"]

# Extract capture groups with .str.extractall()
suits.str.extractall("([ae])(.)")
#          0  1
#   match
# 1 0      a  m
# 2 0      e  a
# 3 0      a  d
#   1      e  s

# Get subset of strings that match with x[x.str.contains()]
suits[suits.str.contains("d")]  # "Diamonds" "Spades"

> Replacing matches

# Replace a regex match with another string with .str.replace()
suits.str.replace("a", "4")  # "clubs" "Di4monds" "he4rts" "Sp4des"

# Remove a suffix with .str.removesuffix()
suits.str.removesuffix("s")  # "club" "Diamond" "heart" "Spade"

# Replace a substring with .str.slice_replace()
rhymes = pd.Series(["vein", "gain", "deign"])
rhymes.str.slice_replace(0, 1, "r")  # "rein" "rain" "reign"

Learn Python Online at www.DataCamp.com
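Alongside .str.findall() and .str.extractall() shown above, .str.extract() returns only the first match, one column per capture group. A small sketch; the group names vowel and next are illustrative:

import pandas as pd

suits = pd.Series(["clubs", "Diamonds", "hearts", "Spades"])

# Named groups become the column names of the result
suits.str.extract(r"(?P<vowel>[ae])(?P<next>.)")
#   vowel next
# 0   NaN  NaN
# 1     a    m
# 2     e    a
# 3     a    d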
Working with Dates and Times in Python
Learn Python Basics online at www.DataCamp.com

> Key definitions

When working with dates and times, you will encounter technical terms and jargon such as the following:
Date: Handles dates without time
POSIXct: Handles date & time in calendar time
POSIXlt: Handles date & time in local time
Hms: Parses periods with hour, minute, and second
Timestamp: Represents a single pandas date & time
Interval: Defines an open or closed range between dates and times
Time delta: Computes time difference between different datetimes

> The ISO 8601 datetime format

The ISO 8601 datetime format specifies datetimes from the largest to the smallest unit of time
(YYYY-MM-DD HH:MM:SS TZ). Some of the advantages of ISO 8601 are:
It avoids ambiguities between MM/DD/YYYY and DD/MM/YYYY formats.
The 4-digit year representation mitigates overflow problems after the year 2099.
Using numeric month values (08, not AUG) makes it language independent, so dates make sense
throughout the world.
Python is optimized for this format, since it makes comparison and sorting easier.

> Packages used in this cheat sheet

Load the packages and dataset used in this cheat sheet.

import datetime as dt
import time as tm
import pytz
import pandas as pd

In this cheat sheet, we will be using 3 pandas series -- iso, us, non_us -- and 1 pandas DataFrame, parts.

iso                    us                     non_us
1969-07-20 20:17:40    07/20/1969 20:17:40    20/07/1969 20:17:40
1969-11-19 06:54:35    11/19/1969 06:54:35    19/11/1969 06:54:35
1971-02-05 09:18:11    02/05/1971 09:18:11    05/02/1971 09:18:11

parts
year  month  day
1969      7   20
1969     11   19
1971      2    5

> Getting the current date and time

# Get the current date
dt.date.today()

# Get the current date and time
dt.datetime.now()

> Reading date, datetime, and time columns in a CSV file

# Specify datetime columns
pd.read_csv("filename.csv", parse_dates = ["col1", "col2"])

# Assemble a datetime column from component columns
pd.read_csv("filename.csv", parse_dates = {"col1": ["year", "month", "day"]})

> Parsing dates, datetimes, and times

# Parse dates in ISO format
pd.to_datetime(iso)

# Parse dates in US format
pd.to_datetime(us, dayfirst=False)

# Parse dates in non-US format
pd.to_datetime(non_us, dayfirst=True)

# Parse dates, guessing a single format
pd.to_datetime(iso, infer_datetime_format=True)

# Parse dates in a single, specified format
pd.to_datetime(iso, format="%Y-%m-%d %H:%M:%S")
pd.to_datetime(us, format="%m/%d/%Y %H:%M:%S")

# Make dates from components
pd.to_datetime(parts)

> Extracting components

# Parse strings to datetimes
dttm = pd.to_datetime(iso)

# Get year from datetime pandas series
dttm.dt.year

# Get day of the year from datetime pandas series
dttm.dt.day_of_year

# Get month name from datetime pandas series
dttm.dt.month_name()

# Get day name from datetime pandas series
dttm.dt.day_name()

# Get datetime.datetime format from datetime pandas series
dttm.dt.to_pydatetime()

> Rounding dates

# Rounding dates to nearest time unit
dttm.dt.round('1min')

# Flooring dates to nearest time unit
dttm.dt.floor('1min')

# Ceiling dates to nearest time unit
dttm.dt.ceil('1min')

> Arithmetic

# Create two datetimes
now = dt.datetime.now()
then = pd.Timestamp('2021-09-15 10:03:30')

# Get time elapsed as timedelta object
now - then

# Get time elapsed in seconds
(now - then).total_seconds()

# Adding a day to a datetime
dt.datetime(2022,8,5,11,13,50) + dt.timedelta(days=1)

> Time Zones

# Get current time zone
tm.localtime().tm_zone

# Get a list of all time zones
pytz.all_timezones

# Get datetime with timezone using location
dttm.dt.tz_localize('CET', ambiguous='infer')

# Get datetime with timezone using UTC offset
dttm.dt.tz_localize('+0100')

# Convert datetime from one timezone to another
dttm.dt.tz_localize('+0100').dt.tz_convert('US/Central')

> Time Intervals

# Create interval datetimes
start_1 = pd.Timestamp('2021-10-21 03:02:10')
finish_1 = pd.Timestamp('2022-09-15 10:03:30')
start_2 = pd.Timestamp('2022-08-21 03:02:10')
finish_2 = pd.Timestamp('2022-12-15 10:03:30')

# Specify the interval between two datetimes
pd.Interval(start_1, finish_1, closed='right')

# Get the length of an interval
pd.Interval(start_1, finish_1, closed='right').length

# Determine if two intervals are intersecting
pd.Interval(start_1, finish_1,
            closed='right').overlaps(pd.Interval(start_2, finish_2, closed='right'))

> Time Deltas

# Define a timedelta in days
pd.Timedelta(7, "d")

# Convert timedelta to seconds
pd.Timedelta(7, "d").total_seconds()

Learn Data Skills Online at www.DataCamp.com
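To close, a minimal sketch (not part of the original sheet) of the parts DataFrame described above, showing how pd.to_datetime() assembles datetimes from year/month/day columns:

import pandas as pd

# Columns named year, month, and day are assembled into one datetime column
parts = pd.DataFrame({"year": [1969, 1969, 1971],
                      "month": [7, 11, 2],
                      "day": [20, 19, 5]})
print(pd.to_datetime(parts))
# 0   1969-07-20
# 1   1969-11-19
# 2   1971-02-05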