Download as pdf or txt
Download as pdf or txt
You are on page 1 of 9

Importing Data ¶

In [ ]: # To check the existing working directory

In [2]: import os

In [4]: os.getcwd()

Out[4]: 'C:\\Users\\rgandyala\\4 Data processing and Stats'

In [6]: import pandas as pd

In [ ]: # To change the working directory

In [6]: import os # The os module provides a way of using operating-system-dependent functionality
import pandas as pd
os.chdir("C:\\Users\\rgandyala\\4 Data processing and Stats")

In [5]: # Now place the data set in the working directory and use the code below to import the data

In [7]: csv1 = pd.read_csv("mba.csv")



#pd.read_csv("C:\\Users\\rgandyala\\4 Data processing and Stats\\mba.csv")

In [8]: csv1

...

In [6]: # pd.read_csv is the command used to import the data


Different ways of Importing csv


pd.read_csv("Iris.csv") is used to load our data into python

pd.read_csv("Iris.csv", skiprows=1) # Skips the first line of the file; the next line is then used as the header

pd.read_csv("Iris.csv", header=1) # Uses the line at index 1 (the second line) as the header; lines before it are discarded

pd.read_csv("Iris.csv", nrows=2) # Reading only first 2 rows

pd.read_csv("Iris.csv", na_values=["n.a.", "not available"]) # Tells pandas which extra strings to treat as missing (NA) values


pd.read_csv("Iris.csv",parse_dates=['day']) # The date column is read as a string by default, so we
parse it as a date data type

In [7]: # if working directory is not set we can access the data directly from any folder

In [9]: import pandas as pd # data frame


import numpy as np # arrays, linear algebra, Fourier transforms

In [10]: data = pd.read_csv("D:\\Course\\Python\\Datasets\\Data.csv")

In [11]: # To view the data


data

...

In [26]: data


#filename['columnname']

...

In [47]: # number of Rows


len(data)

Out[47]: 10

In [48]: # check the number of columns


len(data.columns)

...

In [21]: # To read the column in data set


data["Country"]

...

In [27]: # Viewing Data



data
...
In [28]: ​
data.head() # Displays the first 5 rows by default - pass a number to change how many rows are shown

#data.head(4)

Out[28]: Country Age Salary Purchased

0 France 44.0 72000.0 No

1 Spain 27.0 48000.0 Yes

2 Germany 30.0 54000.0 No

3 Spain 38.0 61000.0 No

4 Germany 40.0 NaN Yes

In [24]: data.tail() # Displays the last 5 rows by default - we can pass the required number of rows

#data.tail(4)

Out[24]: Country Age Salary Purchased

5 France 35.0 58000.0 Yes

6 Spain NaN 52000.0 No

7 France 48.0 79000.0 Yes

8 Germany 50.0 83000.0 No

9 France 37.0 67000.0 Yes

In [26]: data.columns # Names of the columns


Out[26]: Index(['Country', 'Age', 'Salary', 'Purchased'], dtype='object')

In [28]: data.shape # Number of rows and columns

Out[28]: (10, 4)

In [30]: data.values # Displays values of data

...

In [32]: data.dtypes # Data Type of all columns

...

In [33]: data.info() #Information about dataset

...

Data Selection

Pandas has different data Access methods


As usual, we can use the indexing operator "[]" and the attribute operator "." for quick and easy access

.loc[] is for label-based indexing - it selects rows and columns by their labels (or by a boolean condition)

.iloc[] is for integer-position-based indexing - it selects rows and columns by their position

syntax - data.loc[rows, columns] or data.iloc[rows, columns]

In [29]: data

Out[29]: Country Age Salary Purchased

0 France 44.0 72000.0 No

1 Spain 27.0 48000.0 Yes

2 Germany 30.0 54000.0 No

3 Spain 38.0 61000.0 No

4 Germany 40.0 NaN Yes

5 France 35.0 58000.0 Yes

6 Spain NaN 52000.0 No

7 France 48.0 79000.0 Yes

8 Germany 50.0 83000.0 No

9 France 37.0 67000.0 Yes

In [34]: data.loc[0,"Salary"] #loc[row,columns]

...

In [35]: data.loc[data["Purchased"]=="Yes"]
...

In [30]: data

...

In [37]: X = data.iloc[:, :-1].values

In [38]: X

...

In [39]: # To identify the missing values - True means the value is missing in the data
In [40]: data.isnull()

Out[40]: Country Age Salary Purchased

0 False False False False

1 False False False False

2 False False False False

3 False False False False

4 False False True False

5 False False False False

6 False True False False

7 False False False False

8 False False False False

9 False False False False

In [41]: data.isnull().any()# True - the column has a missing value; False - no missing value is present

...

In [42]: data.isnull().sum()#count of missing values in a column


...

In [50]: # To remove columns we can use the del or drop command



data

Out[50]: Country Age Salary Purchased

0 France 44.0 72000.0 No

1 Spain 27.0 48000.0 Yes

2 Germany 30.0 54000.0 No

3 Spain 38.0 61000.0 No

4 Germany 40.0 NaN Yes

5 France 35.0 58000.0 Yes

6 Spain NaN 52000.0 No

7 France 48.0 79000.0 Yes

8 Germany 50.0 83000.0 No

9 France 37.0 67000.0 Yes

In [51]: del data['Country']


In [52]: data

...

In [53]: # To remove a row from the data (drop returns a new DataFrame; data itself is not changed)


data.drop(0)

Out[53]: Age Salary Purchased

1 27.0 48000.0 Yes

2 30.0 54000.0 No

3 38.0 61000.0 No

4 40.0 NaN Yes

5 35.0 58000.0 Yes

6 NaN 52000.0 No

7 48.0 79000.0 Yes

8 50.0 83000.0 No

9 37.0 67000.0 Yes

In [56]: # To Remove the columns



data.drop("Age",axis=1,inplace=True)#column

In [57]: data

...

In [58]: data = pd.read_csv("D:\\Course\\Python\\Datasets\\Data.csv")

In [59]: # To replace the missing values we can use the fillna command


In [60]: data

Out[60]: Country Age Salary Purchased

0 France 44.0 72000.0 No

1 Spain 27.0 48000.0 Yes

2 Germany 30.0 54000.0 No

3 Spain 38.0 61000.0 No

4 Germany 40.0 NaN Yes

5 France 35.0 58000.0 Yes

6 Spain NaN 52000.0 No

7 France 48.0 79000.0 Yes

8 Germany 50.0 83000.0 No

9 France 37.0 67000.0 Yes

In [61]: data.fillna(12) # All the missing values will be replaced by 12

Out[61]: Country Age Salary Purchased

0 France 44.0 72000.0 No

1 Spain 27.0 48000.0 Yes

2 Germany 30.0 54000.0 No

3 Spain 38.0 61000.0 No

4 Germany 40.0 12.0 Yes

5 France 35.0 58000.0 Yes

6 Spain 12.0 52000.0 No

7 France 48.0 79000.0 Yes

8 Germany 50.0 83000.0 No

9 France 37.0 67000.0 Yes


In [63]: data = pd.read_csv("D:\\Course\\Python\\Datasets\\Data.csv")
data

Out[63]: Country Age Salary Purchased

0 France 44.0 72000.0 No

1 Spain 27.0 48000.0 Yes

2 Germany 30.0 54000.0 No

3 Spain 38.0 61000.0 No

4 Germany 40.0 NaN Yes

5 France 35.0 58000.0 Yes

6 Spain NaN 52000.0 No

7 France 48.0 79000.0 Yes

8 Germany 50.0 83000.0 No

9 France 37.0 67000.0 Yes

In [66]: # Drop rows that contain any missing value (use how="all" to drop only rows where every value is missing)



data.dropna()
...

In [68]: data = pd.read_csv("D:\\Course\\Python\\Datasets\\Data.csv")


In [69]: # To Drop the columns of missing values

In [71]: #Drop columns with missing data


data.dropna(axis=1)

...

In [72]: data = pd.read_csv("D:\\Course\\Python\\Datasets\\Data.csv")

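In [73]: # To replace the missing values with the column mean or median (on pandas >= 2.0 use data.mean(numeric_only=True), because the text columns cannot be averaged)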



data.fillna(data.mean(), inplace=True)

In [74]: data

...

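In [75]: # We can also replace the missing values one column at a time (on recent pandas prefer data['Age'] = data['Age'].fillna(...) over inplace=True on a selected column)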


In [76]: data = pd.read_csv("D:\\Course\\Python\\Datasets\\Data.csv")

In [77]: data['Age'].fillna(data['Age'].mean(), inplace=True)

In [78]: data

...

In [79]: data['Salary'].fillna(data['Salary'].mean(), inplace=True)

In [80]: data

...

In [ ]: ​

You might also like