

load hogg %Load the hogg dataset

hogg
diag(var(hogg)) %For independent features the covariance matrix is diagonal, holding the per-column variances

N(\hat{x}; \hat{\mu}, \Sigma) = \frac{1}{(2\pi)^{d/2} \sqrt{|\Sigma|}} \exp\!\left(-\tfrac{1}{2}(\hat{x} - \hat{\mu})^T \Sigma^{-1} (\hat{x} - \hat{\mu})\right)

where \Sigma is the covariance matrix.

Maximum likelihood per class is


\hat{\mu} = \frac{1}{m} \sum_j \hat{x}_j

\Sigma = \frac{1}{m} \sum_j (\hat{x}_j - \hat{\mu})(\hat{x}_j - \hat{\mu})^T
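As a quick sanity check, these estimates can be computed directly and plugged into the density formula by hand, then compared against mvnpdf. This is a minimal sketch; the two iris features and the virginica rows (101:150) match the script that follows, and the query point is an arbitrary choice.

load fisheriris
xc = [meas(101:150,1) meas(101:150,3)]; %One class: virginica, features 1 and 3

mu = mean(xc); %MLE mean
Sigma = cov(xc,1); %MLE covariance (1/m normalisation)

xq = [6.5 5.5]; %Arbitrary query point
d = size(xq,2);
dv = (xq - mu)';
pManual = exp(-0.5*dv'*(Sigma\dv)) / ((2*pi)^(d/2)*sqrt(det(Sigma))); %Density formula by hand
pBuiltin = mvnpdf(xq, mu, Sigma); %Should agree with pManual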

load fisheriris
data = [meas(:,1) meas(:,3)];

m1 = mean(data(101:150,:))
c1 = cov(data(101:150,:))
c1 = diag(c1)' %Keep only the variances (diagonal covariance)
[X1,X2] = meshgrid(linspace(0,8,50)', linspace(0,8,50)');
X = [X1(:) X2(:)];
p = mvnpdf(X,m1,c1);
figure
surf(X1,X2,reshape(p,50,50));

m2 = mean(data(51:100,:))
c2 = cov(data(51:100,:))
c2 = diag(c2)' %Keep only the variances (diagonal covariance)
hold on
p2 = mvnpdf(X,m2,c2);
surf(X1,X2,reshape(p2,50,50));

testx = [2.5 5]
a = mvnpdf(testx,m1,c1);
b = mvnpdf(testx,m2,c2)
a/(a+b)
b/(a+b)

p(x) = \sum_{i=1}^{k} p(x \mid G_i) P(G_i)

\mu_b = \frac{x_1 + x_2 + \dots + x_{n_b}}{n_b}

\sigma_b^2 = \frac{(x_1 - \mu_b)^2 + \dots + (x_{n_b} - \mu_b)^2}{n_b}

z_i^t — the latent indicator that sample t was generated by component i.

clear all
MU1 = [1 2]; %Initialise MEAN1 at [x1 = 1, x2 = 2]
SIGMA1 = [2 0; 0 .5]; %Initialise covariance matrix 1
MU2 = [-3 -5]; %Initialise MEAN2 at [x1 = -3, x2 = -5]
SIGMA2 = [1 0; 0 1]; %Initalise covariance matrix 2
rng(1); %Seed the rng
X = [mvnrnd(MU1,SIGMA1,1000);
mvnrnd(MU2,SIGMA2,1000)]; %Create an array of MVGaussians
figure;
hold on
scatter(X(:,1),X(:,2),10,'.') %Plot data of multivariate Gaussians

options = statset('Display','final'); %Display the output


gm = fitgmdist(X,2,'Options',options); %Fit 2 GMM to the X scatter
gmPDF = @(x,y)pdf(gm,[x y]); %Create pdf with gm fits
h = ezcontour(gmPDF,[-8 6],[-8 6]); %Plot contours of gmPDF
hold off

ComponentMeans = gm.mu %Get the means of the fit


ComponentCovariances = gm.Sigma %Get the covariance matrices
MixtureProportions = gm.PComponents %Get mixture proportions (% data)

AIC = zeros(1,4); %Create empty array for AIC


gm = cell(1,4); %Create empty gaussian model
for k = 1:4
gm{k} = fitgmdist(X,k); %Fit 1,2,...,4 separate models
AIC(k)= gm{k}.AIC; %Get the AIC term for each model
end

[minAIC,numComponents] = min(AIC); %Get the numComponents for minAIC


numComponents; %The AIC is minimised with 2 components
gm2 = gm{numComponents} %Get the 2 Component GMM
load fisheriris; % Load the fisheriris data
X = meas(:,1:2); % Let (x1 = column1) (x2 = column2)
[n,p] = size(X); % Size of the data
rng(3); % Seed random number generator

k = 3; %3 Components to the mixture model


%Covariance purely diagonal or full
Sigma = {'diagonal','full'};
nSigma = numel(Sigma);
%Covariance matrix is shared or unique
SharedCovariance = {true,false};
SCtext = {'true','false'};
nSC = numel(SharedCovariance);

d = 500; %Number of points along each axis of the evaluation grid


x1 = linspace(min(X(:,1)) - 2,max(X(:,1)) + 2,d); %X1 linspace
x2 = linspace(min(X(:,2)) - 2,max(X(:,2)) + 2,d); %X2 linspace
[x1grid,x2grid] = meshgrid(x1,x2); %Mesh from linspace

X0 = [x1grid(:) x2grid(:)]; %Create array defining space


threshold = sqrt(chi2inv(0.99,2)); %99th-percentile Mahalanobis distance (chi-square, 2 dof)
iterations = 1000; %Number of EM iterations
options = statset('MaxIter',iterations); %Set iterations

c = 1;
%Iterate over all possible combinations of
%full/diagonal and shared/unique
for i = 1:nSigma
for j = 1:nSC
%Fit gaussian model to the data, with covariance options
gmfit = fitgmdist(X,k,'CovarianceType',Sigma{i},...
'SharedCovariance',SharedCovariance{j},'Options',options);

%Cluster based on the gaussian fits


clusterX = cluster(gmfit,X);
%Get the Mahalanobis distances for classification
mahalDist = mahal(gmfit,X0);
subplot(2,2,c);
%Plot the scatter of each cluster
h1 = gscatter(X(:,1),X(:,2),clusterX);
hold on;
%Iterate over each component
for m = 1:k
%If the Mahalanobis distance is below the threshold, assign the point to this component
idx = mahalDist(:,m)<=threshold;
%Set colour to that of the component
Color = h1(m).Color*0.75 + -0.5*(h1(m).Color - 1);
h2 = plot(X0(idx,1),X0(idx,2),'.','Color',Color,'MarkerSize',1);
uistack(h2,'bottom');
end
%Plot the mean of the gaussian component

plot(gmfit.mu(:,1),gmfit.mu(:,2),'kx','LineWidth',2,'MarkerSize',10)
%Add legend, title and move to next combination
title(sprintf('Sigma is %s, SharedCovariance = %s',...
Sigma{i},SCtext{j}),'FontSize',8)
legend(h1,{'1','2','3'});
hold off
c = c + 1;
end
end

%Initialise clusters in 4 different ways


cluster0 = {[ones(n-8,1); [2; 2; 2; 2]; [3; 3; 3; 3]];...
randsample(1:k,n,true); randsample(1:k,n,true); 'plus'};
%Create empty convergence array
converged = nan(4,1);
figure;
%Iterate over initial conditions
for j = 1:4
%Fit gaussian model using initialisation j
gmfit = fitgmdist(X,k,'CovarianceType','full',...
'SharedCovariance',false,'Start',cluster0{j},...
'Options',options);
%Find cluster and mahal distance
clusterX = cluster(gmfit,X);
mahalDist = mahal(gmfit,X0);
subplot(2,2,j);
%Assign points to clusters on mahal distance (see last example)
h1 = gscatter(X(:,1),X(:,2),clusterX);
hold on;
nK = numel(unique(clusterX));
for m = 1:nK
idx = mahalDist(:,m)<=threshold;
Color = h1(m).Color*0.75 + -0.5*(h1(m).Color - 1);
h2 = plot(X0(idx,1),X0(idx,2),'.','Color',Color,'MarkerSize',1);
uistack(h2,'bottom');
end
plot(gmfit.mu(:,1),gmfit.mu(:,2),'kx','LineWidth',2,'MarkerSize',10)
legend(h1,{'1','2','3'});
hold off
%Record whether initialisation j converged
converged(j) = gmfit.Converged;
end

\hat{f}_{kernel}(x) = \frac{1}{Nb} \sum_{i=1}^{N} K\!\left(\frac{x - x_i}{b}\right)
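A minimal sketch of this estimator applied to iris feature 4 (the same feature used in the KDE examples later); the Gaussian kernel and bandwidth here are assumed choices, and the result is compared against ksdensity.

load fisheriris
xdata = meas(:,4); %Iris feature 4
b = 0.4; %Bandwidth (assumed)
N = numel(xdata);
xgrid = linspace(min(xdata)-1, max(xdata)+1, 200);

K = @(u) exp(-0.5*u.^2)/sqrt(2*pi); %Gaussian kernel (one common choice of K)
fhat = zeros(size(xgrid));
for i = 1:N
fhat = fhat + K((xgrid - xdata(i))/b); %Sum the kernel contributions
end
fhat = fhat/(N*b);

plot(xgrid, fhat); hold on
[f,xi] = ksdensity(xdata,'Bandwidth',b); %Compare with MATLAB's estimator
plot(xi, f, '--');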

E(\{m_i\} \mid X) = \sum_t \sum_i b_i^t \, \lVert x^t - m_i \rVert^2

b_i^t = \begin{cases} 1, & \text{if } m_i \text{ is the closest mean to } x^t \\ 0, & \text{otherwise} \end{cases}

Euclidean distance: \sqrt{(A_x - B_x)^2 + (A_y - B_y)^2}
Manhattan distance: |A_x - B_x| + |A_y - B_y|
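A one-pass sketch of the assignment indicator b and the reconstruction error E above; the data and the current means here are illustrative values, not part of the notes.

X = randn(100,2); %Illustrative 2-D data
m = [0 0; 2 2; -2 1]; %Current means (K = 3, assumed)
K = size(m,1);

dists = zeros(size(X,1), K);
for i = 1:K
dists(:,i) = sum((X - m(i,:)).^2, 2); %Squared Euclidean distance to mean i
end
[~, closest] = min(dists, [], 2); %Index of the closest mean per point
b = (closest == 1:K); %b(t,i) = 1 if m_i is the closest mean to x^t
E = sum(dists(b)); %Reconstruction error E({m}|X)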
% Dendrograms for the Iris dataset
load fisheriris
%Use average linkage
tree = linkage(meas,'average');
dendrogram(tree,0)
%Use single linkage
tree = linkage(meas,'single');
dendrogram(tree,0)
%Use complete linkage
tree = linkage(meas,'complete');
dendrogram(tree,0)

z = w^T x

\frac{\lambda_1 + \lambda_2 + \dots + \lambda_k}{\lambda_1 + \lambda_2 + \dots + \lambda_k + \dots + \lambda_d}

E(F \cup x_i) < E(F)

j = \arg\min_i E(F \cup x_i)
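A minimal sketch of sequential forward selection built on this criterion. The error function E here is an assumed stand-in (resubstitution error of classify() on the chosen columns) rather than a proper validation error.

load fisheriris %Iris features and labels, as used elsewhere in these notes
X = meas; y = species;
d = size(X,2);

%Assumed error function E(F): misclassification rate of classify() using only columns in F
E = @(idx) mean(~strcmp(classify(X(:,idx), X(:,idx), y), y));

F = []; %Start with the empty feature set
bestErr = inf;
improved = true;
while improved
improved = false;
remaining = setdiff(1:d, F);
errs = arrayfun(@(i) E([F i]), remaining); %E(F U x_i) for every candidate feature
[minErr, kBest] = min(errs);
if minErr < bestErr %Add x_j only if it reduces E
F = [F remaining(kBest)];
bestErr = minErr;
improved = true;
end
end
F %Selected feature subset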

(m_1 - m_2)^2 = (w^T m_1 - w^T m_2)^2 = w^T S_B w

S_B = (m_1 - m_2)(m_1 - m_2)^T

s_1^2 = \sum_t (w^T x^t - m_1)^2 r^t = w^T S_1 w

S_1 = \sum_t (x^t - m_1)(x^t - m_1)^T r^t

J(w) = \frac{|w^T (m_1 - m_2)|^2}{w^T S_W w}

w = c \, S_W^{-1} (m_1 - m_2)

load fisheriris.mat %Load iris data


X = meas(1:100,1:2); %Limit data to feature in c1-c2

m1 = mean(X(1:50,:)) %Calculate mean of first dataset


m2 = mean(X(51:100,:)) %Calculate mean of second dataset

S1 = cov(X(1:50,:)) %Calculate covariance of first dataset


S2 = cov(X(51:100,:)) %Calculate covariance of second dataset
Sw = S1+S2 %Create S_W as combination

%The LDA solution


w = inv(Sw)*(m1-m2)' %Solve using LDA
wscaled = w/norm(w) %Include c term (scaled w)

plot(X(1:50,1),X(1:50,2),'rx') %Plot dataset 1


hold on
plot(X(51:100,1),X(51:100,2),'bx') %Plot dataset 2
line([m1(1), m1(1)+wscaled(1)],[m1(2), m1(2)+wscaled(2)]) %Plot LDA line

datatrain = load('hw2q2Training.csv'); %Load training data


datavalid = load('hw2q2Validation.csv'); %Load validation data
xtrain = datatrain(:,1); ytrain = datatrain(:,2); %Get training var
xvalid = datavalid(:,1); yvalid = datavalid(:,2); %Get validate var

figure()
plot(xtrain, ytrain, '*') %Plot training data

order_list = 0:30; %Compute for orders 0-30


sse_list = []; %Empty array for sse's
features = {}; %Empty array for features
coeffs = {}; %Empty array for coefficients
for order = order_list
features= cat(2, features, ['x^' num2str(order)]); %List of xPowers
coeffs = cat(2, coeffs, ['a' num2str(order)]); %List of coeff

poly = fittype(features, 'coefficients', coeffs); %Build fittype from features and coeffs
model = fit(xtrain, ytrain, poly); %Fit training data to model
ypred = feval(model, xvalid); %Get the predicted yvalues
sse = sum( (ypred - yvalid).^2); %Get sum of squared error
sse_list = [sse_list, sse]; %Add to the SSE list
end

plot(order_list, sse_list) %Plot the data

clear all; close all;


dataTrainRaw = load('BreastCancerTrain.csv'); %Load training data
dataValRaw = load('BreastCancerValidation.csv'); %Load validation data

THRESHOLD = 0.5; %Threshold considered a part of a class (50%)

trainData = dataTrainRaw(:,1:9); %Get training input features


trainLabels = dataTrainRaw(:,10); %Get training classification

posData = trainData(trainLabels == 1, :); %Separate positive data


negData = trainData(trainLabels == 0, :); %Separate negative data

posPrior = size(posData,1)/(size(posData,1)+size(negData,1)); %Prior positive


posMu = mean(posData); %Mean of positive data features
posCov = cov(posData); %Covariance of positive data features
posPostNumTrain = mvnpdf(trainData, posMu, posCov)*posPrior; %+ve MVN dist

negPrior = 1 - posPrior; %Negative prior


negMu = mean(negData); %Mean of negative data features
negCov = cov(negData); %Covariance of negative data features
negPostNumTrain = mvnpdf(trainData, negMu, negCov)*negPrior; %-ve MVN dist
%Get the posteriors for the training data using Bayes rule
posPostTrain = (posPostNumTrain)./(negPostNumTrain+posPostNumTrain);
negPostTrain = (negPostNumTrain)./(negPostNumTrain+posPostNumTrain);

valData = dataValRaw(:,1:9); %Get validation input features


valLabels = dataValRaw(:,10); %Get validation classification

%Apply MVN dist to validation set


posPostNumVal = mvnpdf(valData, posMu, posCov)*posPrior;
negPostNumVal = mvnpdf(valData, negMu, negCov)*negPrior;

%Get the posteriors for the validation data using Bayes rule
postPosVal = (posPostNumVal)./(posPostNumVal+negPostNumVal);
postNegVal = (negPostNumVal)./(posPostNumVal+negPostNumVal);

%Let any value > THRESHOLD be considered +ve class (training)


posPredTrain = posPostTrain > THRESHOLD;
%Get the training accuracy as the fraction correctly identified
trainAccuracy = (sum(posPredTrain == trainLabels))/length(trainLabels);
display((1 - trainAccuracy)*100); %Training error percentage

%Let any value > THRESHOLD be considered +ve class (validation)


posPredVal = postPosVal > THRESHOLD;
%Get the validation accuracy as the fraction correctly identified
valAccuracy = (sum(posPredVal == valLabels))/length(valLabels);
display((1 - valAccuracy)*100); %Validation error percentage

clear all; close all; clc;


x = linspace(0,5,100); %Create linspace 0-5 (100 elements)
y = 2*sin(1.5*x); %True relationship

rng('default'); %For consistency


TrainingInput = datasample(x,20, 'Replace', false); %Rand train input
ValidationInput = datasample(x,20, 'Replace', false); %Rand val input

TrainingOutput = awgn(2*sin(1.5*TrainingInput),1); %Train Output w/ noise


ValidationOutput = 2*sin(1.5*ValidationInput); %Validation output
SSE = []; %Empty array for sse

%Plot the true relationship


figure
hold on
plot(x,y,":");

N = 8; %Maximum order tested


for n = 1:N
p = polyfit(TrainingInput, TrainingOutput, n); %Fit polynomial order n
predOutput = polyval(p, ValidationInput); %Get predicted output on val
SSE = [SSE sum((predOutput-ValidationOutput).^2)]; %Calc SSE
plot(x, polyval(p,x)); %Plot polynomial over x
end

%Plot SSE per order


figure();
plot(1:N, SSE);
clear all; close all; clc;

rawData = load('iris.csv'); %Load iris.csv data

labels = rawData(:,5); %Capture labels (col 5)


features = rawData(:,1); %Capture feature (col 1)

PRIOR = 0.5; %Assume identical priors

x = linspace(min(features)-4,max(features)+4,500); %Assign appro linspace

c1Data = features(labels == 1, :); %All entries relevant to class 1


c1Norm = fitdist(c1Data, 'normal'); %Fit normal dist to class 1
c1PDF = pdf(c1Norm, x); %Evaluate the PDF across x

c2Data = features(labels == 2, :); %All entries relevant to class 2


c2Norm = fitdist(c2Data, 'normal'); %Fit normal dist to class 2
c2PDF = pdf(c2Norm, x); %Evaluate the PDF across x

%Plot Normal PDF's


figure();
hold on;
plot(x, c1PDF);
plot(x, c2PDF);
hold off;

%Calculate posteriors using Bayes rule


c1PostNum = c1PDF*PRIOR;
c2PostNum = c2PDF*PRIOR;
c1Post = (c1PostNum)./(c1PostNum + c2PostNum); %Class 1 posterior
c2Post = (c2PostNum)./(c1PostNum + c2PostNum); %Class 2 Posterior

%Plot posteriors
figure();
hold on;
plot(x, c1Post);
plot(x, c2Post);
clear all; close all; clc;
rawRead = readtable('pima_indians_diabetes.csv');

rawRead(1,:) = [];
rawData = table2array(rawRead(:,1:8)); %Read raw features
rawLabels = string(table2array(rawRead(:,9))); %Read raw labels

rawLabels(rawLabels == 'pos') = 1; %Set pos label to 1 for comparison


rawLabels(rawLabels == 'neg') = 0; %Set neg label to 0 for comparison

rawLabels = double(rawLabels); %Convert strings to double values

trainingData = rawData(1:499, :); %Use data (1-499) for training


trainingLabels = rawLabels(1:499, :); %Use labels (1-499) for training

verifyData = rawData(500:end,:); %Use data (500-768) for verification


verifyLabels = rawLabels(500:end,:); %Use labels (500-768) for verification

posTrain = trainingData(trainingLabels == 1, :); %Source only +ve features


posTrainMu = mean(posTrain); %Calculate the mean of +ve features
posTrainCov = cov(posTrain); %Calculate covariance of +ve features
posTrainPrior = size(posTrain,1)/size(trainingData,1); %Calculate +ve prior
posTrainPostNum = mvnpdf(trainingData, posTrainMu, ...
posTrainCov)*posTrainPrior; %+ve MVN dist

negTrain = trainingData(trainingLabels == 0, :); %Source only -ve features


negTrainMu = mean(negTrain); %Calculate the mean of -ve features
negTrainCov = cov(negTrain); %Calculate covariance of -ve features
negTrainPrior = size(negTrain,1)/size(trainingData,1); %Calculate -ve prior
negTrainPostNum = mvnpdf(trainingData, negTrainMu, ...
negTrainCov)*negTrainPrior; %-ve MVN dist

%Calculate the posterior probability of each training sample belonging to each class


posTrainPos = (posTrainPostNum)./(posTrainPostNum + negTrainPostNum);
negTrainPos = (negTrainPostNum)./(posTrainPostNum + negTrainPostNum);

%Take class with greater probability


predTrain = posTrainPos > negTrainPos;
%Calculate accuracy by comparing with training data
posTrainAccuracy = (sum(trainingLabels == predTrain))/(length(predTrain));
display(posTrainAccuracy);

%Apply +ve MVN to verification data


posVerifyPostNum = mvnpdf(verifyData, posTrainMu, ...
posTrainCov)*posTrainPrior; %+ve MVN dist

%Apply -ve MVN to verification data


negVerifyPostNum = mvnpdf(verifyData, negTrainMu, ...
negTrainCov)*negTrainPrior; %-ve MVN dist

%Calculate the posterior probability of each verification sample belonging to each class


posVerifyPos = (posVerifyPostNum)./(posVerifyPostNum + negVerifyPostNum);
negVerifyPos = (negVerifyPostNum)./(negVerifyPostNum + posVerifyPostNum);

%Calculate accuracy by comparing with verification data


predVerify = posVerifyPos > negVerifyPos;
posVerifyAccuracy = (sum(verifyLabels == predVerify))/(length(predVerify));
display(posVerifyAccuracy);

clear all; close all; clc;

rawData = load('iris.csv'); %Load iris data


feature = rawData(:,4); %Specify feature 4

%Create 2 bin histo subplot


figure();
subplot(2,2,1);
h2 = histogram(feature, 2);
title('Iris Feature 4 - 2 Bins');

%Create 5 bin histo subplot


subplot(2,2,2);
h5 = histogram(feature, 5);
title('Iris Feature 4 - 5 Bins');

%Create 10 bin histo subplot


subplot(2,2,3);
h10 = histogram(feature, 10);
title('Iris Feature 4 - 10 Bins');

%Create 50 bin histo subplot


subplot(2,2,4);
h50 = histogram(feature, 50);
title('Iris Feature 4 - 50 Bins');

clear all; close all; clc;

rawData = load('iris.csv'); %Load iris data


feature = rawData(:,4); %Specify feature 4

%Plot KDE for IRIS Feature 4 Bandwidth 0.3


subplot(4,1,1);
[f1, xi1,bw1] = ksdensity(feature, 'Bandwidth', 0.3);
plot(xi1, f1);
title('KDE IRIS Data - Bandwidth 0.3');

%Plot KDE for IRIS Feature 4 Bandwidth 0.4


subplot(4,1,2);
[f2, xi2,bw2] = ksdensity(feature, 'Bandwidth', 0.4);
plot(xi2, f2);
title('KDE IRIS Data - Bandwidth 0.4');

%Plot KDE for IRIS Feature 4 Bandwidth 0.5


subplot(4,1,3);
[f3, xi3,bw3] = ksdensity(feature, 'Bandwidth', 0.5);
plot(xi3, f3);
title('KDE IRIS Data - Bandwidth 0.5');
%Plot KDE for IRIS Feature 4 Bandwidth 0.6
subplot(4,1,4);
[f4, xi4,bw4] = ksdensity(feature, 'Bandwidth', 0.6);
plot(xi4, f4);
title('KDE IRIS Data - Bandwidth 0.6');
close all; clear all; clc;

heightWeightStruct = load('heightWeight.mat'); %Load the height/weight data


data = heightWeightStruct.heightWeightData(:, 2:3); %Access data component

K = 3; %Expected number of clusters


epsilon = 0.1; %Threshold movement before convergence
testCentroid = []; %Contains centroids of clusters

%ASSUME 2 Features (can be changed to include more)


for n = 1:2
%Add K random positions for centers
testCentroid = [testCentroid (max(data(:,n))...
-min(data(:,n))).*rand(K,1) + min(data(:,n))];
end

%Iterate over moving cluster means


while(true)
dists = []; %Distance
for c = 1:K
diff = data - testCentroid(c,:); %Diff between centroid and data
dists = [dists sqrt(diff(:,1).^2+diff(:,2).^2)]; %add euclid dist
end
[mValue,ind] = min(dists, [], 2); %Find mean closest to data points
movement = []; %Track movement in center
for c = 1:K %Iterate over clusters
newCenter = mean(data(ind==c, :)); %Find new center on means
mov = newCenter - testCentroid(c,:); %Movement of center
movement(c,:) = sqrt(mov(:,1).^2+mov(:,2).^2); %Add euclid dist
testCentroid(c, :) = newCenter; %Set new center
end
if(movement < epsilon) %Converged if center moved small distance
break;
end
end

[idx,C] = kmeans(data, K); %Verify with matlab kmeans function


close all; clear all; clc;

NPCA = 2; %Number of desired Principal components


CLASSES = 10; %Number of classes (only for colouring)

dStruct = load('mnist_train.mat'); %Access the MNIST data struct


LABELS = dStruct.train_labels; %Access raw LABELS
DATA = dStruct.train_X; %Access raw FEATURE DATA

covMatrix = cov(DATA); %Calculate covariance of features


[eigVect, eigVal] = eig(covMatrix); %Calculate eig of cov matrix

[d, ind] = sort(diag(eigVal), 'descend'); %Get indexes to sort eig


eigValSorted = (diag(eigVal(ind,ind))); %Sort values descending
eigVectSorted = eigVect(:,ind); %Sort vectors descending

%Select first NPCA values and vectors


eigValSelect = reshape(eigValSorted(1:NPCA,:),[1,2]);
eigVectSelect = eigVectSorted(:,1:NPCA);

%Calculate the percentage of variance captured


varCaptured = sum(eigValSelect)/sum(eigValSorted)*100;
display(varCaptured);

reducedTrainX = DATA*eigVectSelect; %Project data onto PCA1 and PCA2 plane

%Plot the projected data


cmap = {'r','b','g','y','c','m','k','w',[0.4,0.6,0.7],[0.2,0.8,0.5]}; %One colour per class (cell array keeps RGB triplets intact)
figure();
hold on;
xlabel('Principal component 1');
ylabel('Principal component 2');
title('MNIST data summarised with 2 principal components');
%Iterate over classes and colour accordingly
for n = 1:CLASSES
scatter(reducedTrainX(LABELS == n,1), ...
reducedTrainX(LABELS == n,2), 10, cmap{n});
end
hold off;

%Plot the Scree graph for values in descending order


figure();
plot(1:length(eigValSorted), eigValSorted, '-x');
xlabel('Value index');
ylabel('EigenValue');
title('Scree Graph');

%Calculate the portion of variance captured with increasing values included


varProp = []; %Empty array for the cumulative proportion of variance
for n = 1:length(eigValSorted)
varProp = [varProp sum(eigValSorted(1:n))/(sum(eigValSorted))];
end
figure();
plot(varProp);
xlabel('Number of eigenValues');
ylabel('Portion of variance captured');

close all; clear all; clc;


xf = [179, 168,169,183, 172]; %Female Data
xm = [201, 173, 154, 191]; %Male data

mf = mean(xf); %Max likelihood female mean


mm = mean(xm); %Max likelihood male mean
sf = sum((xf - mf).^2)/length(xf); %Max likelihood female variance
sm = sum((xm - mm).^2)/length(xm); %Max likelihood male variance

TESTPOINT = 210; %Test at x=210


x = linspace(0,300,500); %Plot over [0-300]

pdF = makedist('Normal', 'mu', mf, 'sigma', sqrt(sf)); %Create female normal dist (sigma is the std dev)
pdfF = pdf(pdF, TESTPOINT); %Get probability density @ 210
plot(x,pdf(pdF,x)); %Plot female pdf
hold on

pdM = makedist('Normal', 'mu', mm, 'sigma', sqrt(sm)); %Create male normal dist (sigma is the std dev)
pdfM = pdf(pdM, TESTPOINT); %Get probability density @ 210
plot(x,pdf(pdM,x)); %Plot male pdf

FEMALE_PRIOR = (length(xf))/(length(xm) + length(xf)); %Female prior (from data)
femalePostNum = FEMALE_PRIOR*pdfF; %Female posterior numerator

MALE_PRIOR = (length(xm))/(length(xm) + length(xf)); %Male prior (from data)
malePostNum = MALE_PRIOR*pdfM; %Male posterior numerator

postProbF = (femalePostNum)/(femalePostNum+malePostNum); %Post prob female


postProbM = (malePostNum)/(femalePostNum+malePostNum); %Post prob male

\mu_b = \frac{b_1 x_1 + b_2 x_2 + \dots + b_n x_n}{b_1 + b_2 + \dots + b_n}

\sigma_b^2 = \frac{b_1 (x_1 - \mu_b)^2 + \dots + b_n (x_n - \mu_b)^2}{b_1 + b_2 + \dots + b_n}
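A minimal sketch of these weighted (soft-assignment) updates for a two-component 1-D mixture; the data, current parameters and priors are illustrative values.

x = [randn(50,1); 4 + randn(50,1)]; %Illustrative 1-D data from two clusters
mu = [0 4]; %Current component means (assumed)
sg2 = [1 1]; %Current component variances (assumed)
pr = [0.5 0.5]; %Mixture proportions

%E-step: responsibilities b(t,i) for each sample t and component i
lik = [normpdf(x, mu(1), sqrt(sg2(1))), normpdf(x, mu(2), sqrt(sg2(2)))];
b = (lik .* pr) ./ sum(lik .* pr, 2);

%M-step for component 1: the weighted mean and variance above
mu1 = sum(b(:,1) .* x) / sum(b(:,1));
sg1sq = sum(b(:,1) .* (x - mu1).^2) / sum(b(:,1));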

\hat{p}(x) = \frac{1}{Nh} \sum_{t=1}^{N} K\!\left(\frac{x - x^t}{h}\right)

K(u) = \begin{cases} 1, & |u| < \tfrac{1}{2} \\ 0, & \text{otherwise} \end{cases}

\hat{p}(x) = \frac{\#\{x^t \text{ in the same bin as } x\}}{Nh}
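A minimal sketch of the naive estimator with the box kernel K(u) defined above, run on iris feature 4 (the feature used in the histogram examples); the window width h is an assumed value.

load fisheriris
xdata = meas(:,4); %Iris feature 4
h = 0.5; %Window width (assumed)
N = numel(xdata);
xgrid = linspace(min(xdata)-1, max(xdata)+1, 200);

K = @(u) double(abs(u) < 0.5); %Box kernel from the definition above
phat = zeros(size(xgrid));
for t = 1:N
phat = phat + K((xgrid - xdata(t))/h);
end
phat = phat/(N*h); %Naive (moving-window) density estimate
plot(xgrid, phat);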

\frac{1}{(1 \times 8)} (3 \times 3 \times 3 \times 1 \times 1 \times 2 \times 2 \times 1) = 6.44 \times 10^{-6}

y = \mathrm{Step}\!\left(\sum_i x_i w_i\right)

display(neuron(3));

function output = neuron(pointNum)


rng('default'); %Seed rng
x = randn(40,2); %Randomly generate data
x(21:40,1) = x(21:40,1)+5; %Create another class
figure;
plot(x(1:20,1),x(1:20,2),'g+');
grid;
hold on;
plot(x(21:40,1),x(21:40,2),'m+');
w0 = rand; %Randomly init bias
w1 = rand; %Randomly init weight 1
w2 = rand; %Randomly init weight 2
fplot(@(xd) w0 + (xd*w1) + (xd*w2),[-3,3]);

%Calculate point classification


wsumin = w0 + (x(pointNum,1)*w1) + (x(pointNum,2)*w2);
if (wsumin>0.5) %If more likely green class
output = 1; %Output 1
else %More likely magenta class
output = 0; %Output 0;
end
end

w_i = w_i + \Delta w_i
\Delta w_i = \eta (y_i - t_i) x_i

\Delta w_i = -\eta \frac{d E_k^2}{d w_i} = 2\eta E_k x_i = 2\eta (t_k - w_0 - w_1 x_1 - w_2 x_2) x_i
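A minimal sketch of this delta-rule update on the same two synthetic classes generated in the neuron example above; the learning rate and the number of passes are assumed values.

rng('default');
x = randn(40,2); x(21:40,1) = x(21:40,1) + 5; %Same synthetic classes as the neuron example
t = [zeros(20,1); ones(20,1)]; %Targets: 0 = first class, 1 = second class

eta = 0.01; %Learning rate (assumed)
w = rand(3,1); %[w0; w1; w2], random initialisation
for epoch = 1:100 %Number of passes (assumed)
for kk = 1:size(x,1)
yk = w(1) + w(2)*x(kk,1) + w(3)*x(kk,2); %Linear output
Ek = t(kk) - yk; %Error E_k = t_k - (w0 + w1*x1 + w2*x2)
w = w + 2*eta*Ek*[1; x(kk,1); x(kk,2)]; %Delta-rule update from the equation above
end
end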

clear all; close all; clc;

data = load('mnist_train.mat'); %Load training data


mNistInputs = data.train_X; %Load mnist training data
mNistLabels = []; %Format mnist training labes
for i = 1:10
mNistLabels = [mNistLabels, data.train_labels == i]; %Format label
end

image_name = '28x28_8.bmp'; %%Set image name

myImageInput = imread(image_name); %Load image


myImageVec = reshape((1-myImageInput)',1,784); %Reshape to vector input

%Run NPRTOOL and save mnist_example_classifier when trained

%Proceed if mnist_example_classifier has been made


myImageOutputs = mnist_example_classifier(myImageVec); %Classify image
myImageProbabilities = myImageOutputs/sum(myImageOutputs); %Normalise outputs to probabilities

%Show image that is being classified


figure
imshow(reshape(1-myImageVec, 28,28)') ;
title(sprintf('Input image: %s',image_name))

%Show the histogram of probabilities


figure
bar(0:9,myImageProbabilities)
title(sprintf('Classified image probabilities: %s',image_name))
xlabel('Written Number')
ylabel('Probability')
Let the y-axis be transformed by y = x^n

K(x_i, x_j) = \varphi(x_i)^T \varphi(x_j)

\phi : x \to \varphi(x)

y = (a \cdot b + r)^n

(a \cdot b + r)^2 = a^2 b^2 + 2r\,ab + r^2 = (a^2, \sqrt{2r}\,a, r) \cdot (b^2, \sqrt{2r}\,b, r)

y = e^{-\gamma (a - b)^2}
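A quick numerical check of K(x_i, x_j) = φ(x_i)^T φ(x_j) for the quadratic polynomial kernel, using the feature map from the expansion above; a, b and r are arbitrary values.

a = 1.3; b = -0.7; r = 2; %Arbitrary scalar inputs and offset
Kpoly = (a*b + r)^2; %Kernel evaluated directly
phi = @(v) [v^2, sqrt(2*r)*v, r]; %Explicit feature map
Kmap = phi(a)*phi(b)'; %Same value via the inner product
fprintf('kernel: %.4f  feature map: %.4f\n', Kpoly, Kmap);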

load fisheriris

y = species; %Load species data as label


X = meas; %Load meas data as features
cp = cvpartition(y,'k',10); %Partition into 10 folds

classF = @(XTRAIN,ytrain,XTEST)(classify(XTEST,XTRAIN, ytrain));

cvMCR = crossval('mcr',X,y,'predfun',classF, 'partition',cp); %Apply cross-validation

rng('default'); %For consistency


[Xtrain, Ytrain] = cancer_dataset; %Load cancer dataset

net = patternnet(10); %Define network


net = train(net, Xtrain, Ytrain); %Train network
YPredicted = net(Xtrain); %Determine predicted from the XTraining
plotconfusion(Ytrain,YPredicted); %Plot confusion between true vs pred

load fisheriris; %Load iris data


pred = meas(51:end,1:2);
resp = (1:100)'>50; % Versicolor = 0, virginica = 1
mdl = fitglm(pred,resp,'Distribution','binomial','Link','logit'); %Fit logistic regression model
scores = mdl.Fitted.Probability;
[X,Y,T,AUC] = perfcurve(species(51:end,:),scores,'virginica');
plot(X,Y)
xlabel('False positive rate')
ylabel('True positive rate')
title('ROC for Classification by Logistic Regression')

load fisheriris; %Load iris data


cTree = fitctree(meas, species); %Fit classification tree
view(cTree, 'mode', 'graph'); %Plot the classification tree

load carsmall %Load carsmall data


X = [Horsepower Weight]; %Set input features
rTree = fitrtree(X,MPG,'MinParent', 30); %Fit regression tree
view(rTree, 'mode', 'graph');

P(A=1, B=0, C=1) = P(A=1) \cdot P(B=0) \cdot P(C=1)

P(W) = \sum_R P(R, W) = P(W \mid R) P(R) + P(W \mid \neg R) P(\neg R)

P(R \mid W) = \frac{P(W \mid R) P(R)}{P(W)}

P(X, Y \mid Z) = P(X \mid Z) P(Y \mid Z)
P(X \mid Y, Z) = P(X \mid Z)

P(W \mid C) = P(W \mid R) P(R \mid C) + P(W \mid \neg R) P(\neg R \mid C)

P(R \mid S) = \sum_C P(R, C \mid S) = P(R \mid C) P(C \mid S) + P(R \mid \neg C) P(\neg C \mid S)

P(C \mid R) = \frac{P(R \mid C) P(C)}{P(R)} = \frac{P(R \mid C) P(C)}{P(R \mid C) P(C) + P(R \mid \neg C) P(\neg C)}

P(W) = \sum_{R,S} P(W, R, S) = \sum_{R,S} P(W \mid R, S) P(R) P(S)

[Bayesian network: Rain → Wet Grass, with P(R) = 0.4, P(W|R) = 0.9, P(W|¬R) = 0.2]
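Using the numbers in this network, the marginal P(W) and the posterior P(R|W) above can be computed directly; a minimal sketch:

pR = 0.4; %P(R)
pW_R = 0.9; %P(W | R)
pW_notR = 0.2; %P(W | ~R)

pW = pW_R*pR + pW_notR*(1 - pR); %P(W) by marginalising over R -> 0.48
pR_W = pW_R*pR / pW; %P(R | W) by Bayes rule -> 0.75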

p(\theta \mid x) = \frac{p(x \mid \theta)\, p(\theta)}{p(x)} \propto L(\theta \mid x)\, p(\theta)

close all; clear all; clc;

%Bernoulli likelihood is p(X|q) = prod_t q^(x^t) (1-q)^(1-x^t)


%Beta is the prior distribution with parameters (beta and alpha)

%Specify prior
alpha = 5;
beta = 10;

%Check it out via a sample of 500 points


xPrior = betarnd(alpha, beta, 500, 1);
%histogram(xPrior);
figure
hold on

%Specify posterior
A1 = 20; %Sum of sample
N1 = 30; %Size of the sample

xPost1 = betarnd(A1+alpha, N1-A1+beta, 500, 1); %Posterior is Beta(alpha+A, beta+N-A)


histogram(xPost1); %Plot histogram

%Specify a different posterior


A2 = 200; %Sum of the sample
N2 = 300; %Size of the sample

xPost2 = betarnd(A2+alpha, N2-A2+beta, 500, 1); %Sample the posterior Beta(alpha+A, beta+N-A)


histogram(xPost2); %Plot histogram
clear all; close all; clc;

%Using internal dataset


load ionosphere
inputData = array2table(X);
outputLabels = cell2table(Y);

%Read table of values if a CSV


%rawData = readtable('hw1mystery.csv', 'header', 1);
%inputData = rawData(1:50);
%outputLabels = rawData(51);

data = cleanData(inputData);
data.labels = cleanData(outputLabels);

for n = 1:size(data.X,2)
feature = data.X(:,n);

nUnique = length(unique(feature));
%Determine type of data such that
%0 CONSTANT: Feature is constant for all values
%1 BINARY: Feature only takes two distinct values
%2 CATEGORICAL: Feature takes relatively few (<10% of samples) distinct values
%3 REGRESSION: Feature takes a large number of distinct values

if (nUnique == 1)
data.type = [data.type; [0, nUnique]]; %CONSTANT

elseif (nUnique == 2)
data.type = [data.type; [1, nUnique]]; %BINARY

elseif ((nUnique)/length(data.X(:,n))) < 0.1


data.type = [data.type; [2, nUnique]]; %CATEGORICAL

else
data.type = [data.type; [3, nUnique]]; %REGRESSION
end

%Calculate mean and standard deviation


data.mean = [data.mean; mean(feature)];
data.stdev = [data.stdev; std(feature)];
data.range = [data.range; min(feature) max(feature)];
end

figure();
heatmap(corr(data.X));
data.corr = corr(data.X);
[valMax, idxMax] = max(abs(data.corr - eye(size(data.X,2))));
[valMin, idxMin] = min(abs(data.corr));

[sortMaxVal, sortMaxIdx] = sort(valMax, 'descend');


[sortMinVal, sortMinIdx] = sort(valMin, 'ascend');
nMax = 1:size(data.X,2);
nMin = 1:size(data.X,2);

data.maxCorr = [nMax(sortMaxIdx); idxMax(sortMaxIdx); sortMaxVal];


data.minCorr = [nMin(sortMinIdx); idxMin(sortMinIdx); sortMinVal];

fprintf("Max corr between features %d and %d with correlation of %d\n",...


data.maxCorr(1,1), data.maxCorr(2,1), data.maxCorr(3,1));
fprintf("Min corr between features %d and %d with correlation of %d\n",...
data.minCorr(1,1), data.minCorr(2,1), data.minCorr(3,1));

function data = cleanData(rawData)


%Exploratory Analysis
data.X = [];
data.Y = [];
data.type = [];
data.mean = [];
data.stdev = [];
data.numNan = [];
data.nanIdx = [];
data.range = [];

data.mapping.fNo = [];
data.mapping.labels = [];

for n = 1:size(rawData,2)
feature = table2array( rawData(:,n));
if(isa(feature, 'cell')) %Must contain at least 1 string
nan = 0; %NAN Count
nanIdx = []; %Indexes of NAN
for i = 1:length(feature)
if isempty(str2num(cell2mat(feature(i)))) %Empty is NAN
nan = nan + 1; %Add to count
nanIdx = [nanIdx i]; %Save index
end
end
data.numNan = [data.numNan; [n nan]];

%Less than 90% are not numbers, this is regressive data


if(nan/length(feature) < 0.90)
fprintf('Removing strings from numerical data - Feature %d\n', n);
feature(nanIdx) = {'NaN'};
feature = str2double(feature);
%More than 90% are not numbers, this is catagorical data
else
fprintf('Converting categorical data to doubles - Feature %d\n', n);
[feature, map] = grp2idx(feature);
data.mapping.fNo = [data.mapping.fNo; n];
data.mapping.labels = [data.mapping.labels map];
end
end

if((size(unique(feature), 1) == 1) && (feature(1) == 0))


fprintf("Feature %d is a constant 0 " ,n);
fprintf("and will present as NaN in correlation
caluclations.\n");
end
%Feature should have been converted to a double
if(isa(feature,'double'))
[feature TF] = fillmissing(feature, 'linear');
data.X = [data.X feature];
%"Uh oh"
else
display("Something went wrong - Feature %d was removed", n);
end
end
end
clear all; close all; clc; %Clear all variables and plots

CMAP = ['b','r','g','y','c','m','w']; %Colour mapping for graphs

%------------START EXPLORATORY ANALYSIS-----------------


%If using internal dataset, load
load fisheriris %Load internal dataset
inputData = array2table(meas); %Input feature array as table
outputLabels = cell2table(species); %Output label array as table

%If using external .csv file, readtable


%rawData = readtable('pima_indians_diabetes.csv', 'header', 1);
%inputData = rawData(:,1:8); %Input feature array as table
%outputLabels = rawData(:,9); %Output label array as table

%Ensure the data is clean (all numeric and NaN removed)


[data.X, data.numNanX, data.mappingX] = cleanData(inputData); %Clean input
[data.Y, data.numNanY, data.mappingY] = cleanData(outputLabels); %Clean output

data.CLASSES = unique(data.Y); %Get number of classes as num unique labels

%Apply exploratory analysis of dataset


data.statX = exploreData(data.X); %Explore input characteristics
data.statY = exploreData(data.Y); %Explore label characteristics

rng('default'); %For consistency


%Random sampling for training data
trainSample = round((length(data.X)- 1).*rand(100,1) + 1);

%Split into training and validation datasets


data.trainingX = data.X(trainSample,:) - mean(data.X(trainSample, :));
%Center data
data.trainingY = data.Y(trainSample,:);

rng(0, 'twister'); %For consistency, different to training data


%Random sampling for validation data
valSample = round((length(data.X)- 1).*rand(100,1) + 1);

data.validationX = data.X(valSample,:) - mean(data.X(valSample,:)); %Center data
data.validationY = data.Y(valSample,:);

%------------START PRINCIPLE COMPONENT ANALYSIS-----------------


PCA = performPCA(data.statX.cov); %Apply PCA to training dataset
data.trainingXProj = data.trainingX*PCA.PCEigVect; %Project training dataset

figure(); %Generate new figure


subplot(2,1,1) %Add to subplot (Position 1)
title('Scatter of original training data');
xlabel('PC1');
ylabel('PC2');
hold on;
for n = 1:length(data.CLASSES) %Iterate over classes
%Plot scatterplot of the original training dataset (no PCA projection)
scatter(data.trainingX(data.trainingY == n, 1), ...
data.trainingX(data.trainingY == n, 2), CMAP(n));
end

subplot(2,1,2) %Add subplot (Position 2)


hold on;
title('Scatter of training data projected to principal components');
xlabel('PC1');
ylabel('PC2');
for n = 1:length(data.CLASSES)
%Plot scatterplot of the projected PCA training dataset
scatter(data.trainingXProj(data.trainingY == n,1), ...
data.trainingXProj(data.trainingY == n,2), CMAP(n));
end
hold off;

%------------START K-MEANS CLUSTERING-----------------


%Apply kMeans clustering to the unmodified dataset
[kMeans.centerNoMod,kMeans.posNoMod] = ...
kMeanClust(data.trainingX(:,1:2), length(data.CLASSES));

%Apply kMeans clustering to the PCA projected dataset


[kMeans.centerPCA,kMeans.posPCA] = ...
kMeanClust(data.trainingXProj, length(data.CLASSES));

figure(); %Generate new figure


subplot(2,1,1); %Add subplot (Position 1)
hold on;
title('K-Means classification on original training data');
xlabel('PC1');
ylabel('PC2');
for n = 1:length(data.CLASSES)
%Plot scatter plot of the original data, catagorised by kMeans
scatter(data.trainingX(kMeans.posNoMod == n, 1), ...
data.trainingX(kMeans.posNoMod == n, 2), CMAP(n));
end
%Plot the K class converged means
plot(kMeans.centerNoMod(:,1), kMeans.centerNoMod(:,2), 'X',...
'MarkerEdgeColor','k', 'LineWidth',3);

subplot(2,1,2); %Add subplot (Position 2)


hold on;
title('k-Means classification on PCA projected data');
xlabel('PC1');
ylabel('PC2');
for n = 1:length(data.CLASSES)
%Plot scatter plot of the PCA project data, catagorised by kMeans
scatter(data.trainingXProj(kMeans.posPCA == n, 1), ...
data.trainingXProj(kMeans.posPCA == n, 2), CMAP(n));
end
%Plot the K class converged means
plot(kMeans.centerPCA(:,1), kMeans.centerPCA(:,2), 'X',...
'MarkerEdgeColor','k', 'LineWidth',3);
hold off;

%Calculate classification accuracy of the No Modification Data kMeans


kMeans.predNoMod = round(100 - sum((data.trainingY - kMeans.posNoMod) ~= 0)/...
(length(data.trainingY))*100,2);
fprintf('k-Means clustering of the Unmodified dataset had an accuracy of %.2f%%\n',...
kMeans.predNoMod);

%Calculate classification accuracy of the PCA Projected Data kMeans


kMeans.predPCA = round(100 - sum((data.trainingY - kMeans.posPCA) ~= 0)/...
(length(data.trainingY))*100,2);
fprintf('k-Means clustering of the PCA data set had an accuracy of %.2f%%\n',...
kMeans.predPCA);

%----------START MULTIVARIATE GAUSSIAN CLASSIFICATION FITTING-------------


%Fit MVG to the unmodified training dataset and verify with the validation dataset
MVG = MVGFit(data.CLASSES, data.trainingX, data.trainingY, ...
data.validationX, data.validationY);
%Calculate classification accuracy of the MVG on the validation dataset
fprintf('Fitting Multivariate Gaussian to the unmodified data set had a validation accuracy of %.2f%%\n',...
MVG.VAccuracy);

function MVG = MVGFit(K, tX, tY, vX, vY)


MVG.TPosterior = [];
MVG.TPostNumSum = zeros(size(K, 2), 1);

for n = 1:length(K)
classFeatures = tX(tY == n,:);
MVG.TStat(n).prior = length(classFeatures)/length(tY);
MVG.TStat(n).mean = mean(classFeatures);
MVG.TStat(n).cov = cov(classFeatures);
MVG.TStat(n).postNum = mvnpdf(tX, MVG.TStat(n).mean, ...
MVG.TStat(n).cov)*MVG.TStat(n).prior;
end

for n = 1:length(K)
MVG.TPostNumSum = MVG.TPostNumSum + MVG.TStat(n).postNum;
end

figure();
hold on
for n = 1:length(K)
MVG.TStat(n).post = (MVG.TStat(n).postNum)./(MVG.TPostNumSum);
MVG.TPosterior = [MVG.TPosterior MVG.TStat(n).post];
end
hold off;

[~, MVG.predTrain] = max(MVG.TPosterior, [], 2);


MVG.TAccuracy = (1 - sum(MVG.predTrain ~= tY)/length(tY))*100;

MVG.VPosterior = [];
MVG.VPostNumSum = zeros(size(K, 2) ,1);

for n = 1:length(K)
MVG.VStat(n).postNum = mvnpdf(vX, ...
MVG.TStat(n).mean, MVG.TStat(n).cov)*MVG.TStat(n).prior;
end

for n = 1:length(K)
MVG.VPostNumSum = MVG.VPostNumSum + MVG.VStat(n).postNum;
end

for n = 1:length(K)
MVG.VStat(n).post = (MVG.VStat(n).postNum)./(MVG.VPostNumSum);
MVG.VPosterior = [MVG.VPosterior MVG.VStat(n).post];
end

[~, MVG.predValid] = max(MVG.VPosterior, [], 2);


MVG.VAccuracy = (1- sum(MVG.predValid ~= vY)/length(vY))*100;
end

function [centerPos, ind] = kMeanClust(data, K, epsilon)


if ~exist('epsilon','var')
epsilon = 0.1; %Threshold movement before convergence
end

centerPos = []; %Contains centroids of clusters


rng(100000); %Change
%ASSUME 2 Features (can be changed to include more)
for n = 1:2
%Add K random positions for centers
centerPos = [centerPos (max(data(:,n))...
-min(data(:,n))).*rand(K,1) + min(data(:,n))];
end

%Iterate over moving cluster means


while(true)
dists = []; %Distance
for c = 1:K
diff = data - centerPos(c,:); %Diff between centroid and data
dists = [dists sqrt(diff(:,1).^2+diff(:,2).^2)]; %add euclid dist
end
[mValue,ind] = min(dists, [], 2); %Find mean closest to data points
movement = []; %Track movement in center
for c = 1:K %Iterate over clusters
newCenter = mean(data(ind==c, :)); %Find new center on means
mov = newCenter - centerPos(c,:); %Movement of center
movement(c,:) = sqrt(mov(:,1).^2+mov(:,2).^2); %Add euclid dist
centerPos(c, :) = newCenter; %Set new center
end
if(movement < epsilon) %Converged if center moved small distance
break;
end
end

end

function PCA = performPCA(cov)


%Apply PCA to find principal components
NPCA = 2;
[eigVect, eigVal] = eig(cov);
[d, ind] = sort(diag(eigVal), 'descend');
PCA.eigVal = (diag(eigVal(ind, ind)));
PCA.eigVect = eigVect(:, ind);

PCA.PCEigVal = reshape(PCA.eigVal(1:NPCA,:),[1,2]);
PCA.PCEigVect = PCA.eigVect(:,1:NPCA);

[dir,midx] = max(abs(PCA.PCEigVect));
for i = 1:NPCA
percent = abs(dir(:,i))/sum(abs(PCA.PCEigVect(:,i)))*100;
fprintf('PC%d has %.2f%% of its weight in the direction of feature %d\n',i, round(percent,2), midx(i));
end

PCA.varCaptured = (sum(PCA.PCEigVal)/(sum(PCA.eigVal)))*100;
fprintf("PCA with %d components is capturing %.2f%% of the
variation\n",...
NPCA, round(PCA.varCaptured,2));

figure()
plot(1:length(PCA.eigVal), PCA.eigVal, '-x');
xlabel('Value index');
ylabel('EigenValue');
title('Scree graph');

figure();
varProp = [];
for n = 1:length(PCA.eigVal)
varProp = [varProp sum(PCA.eigVal(1:n))/(sum(PCA.eigVal))];
end
plot(varProp);
xlabel('Number of eigenvalues');
ylabel('Portion of variance captured');
title('Percentage variance captured per number of principal components');
end

function stat = exploreData(data)


stat.type = [];
stat.mean = [];
stat.stdev = [];
stat.range = [];
for n = 1:size(data,2)
feature = data(:,n);

nUnique = length(unique(feature));
%Determine type of data such that
%0 CONSTANT: Feature is constant for all values
%1 BINARY: Feature only takes two distinct values
%2 CATEGORICAL: Feature takes relatively few (<10% of samples) distinct values
%3 REGRESSION: Feature takes a large number of distinct values

if (nUnique == 1)
stat.type = [stat.type; [0, nUnique]]; %CONSTANT

elseif (nUnique == 2)
stat.type = [stat.type; [1, nUnique]]; %BINARY
elseif ((nUnique)/length(data(:,n))) < 0.1
stat.type = [stat.type; [2, nUnique]]; %CATEGORICAL

else
stat.type = [stat.type; [3, nUnique]]; %REGRESSION
end

%Calculate mean and standard deviation


stat.mean = [stat.mean; mean(feature)];
stat.stdev = [stat.stdev; std(feature)];
stat.range = [stat.range; min(feature) max(feature)];
end
if(size(data,2) > 1)
stat.corr = corr(data);
stat.cov = cov(data);
figure();
heatmap(stat.corr);

[valMax, idxMax] = max(abs(stat.corr - eye(size(data,2))));


[valMin, idxMin] = min(abs(stat.corr));

[sortMaxVal, sortMaxIdx] = sort(valMax, 'descend');


[sortMinVal, sortMinIdx] = sort(valMin, 'ascend');
nMax = 1:size(data,2);
nMin = 1:size(data,2);

stat.maxCorr = [nMax(sortMaxIdx); idxMax(sortMaxIdx); sortMaxVal];


stat.minCorr = [nMin(sortMinIdx); idxMin(sortMinIdx); sortMinVal];

fprintf("Max corr between features %d and %d with correlation of


%.2f\n",...
stat.maxCorr(1,1), stat.maxCorr(2,1),
round(stat.maxCorr(3,1),2));
fprintf("Min corr between features %d and %d with correlation of
%.2f\n",...
stat.minCorr(1,1), stat.minCorr(2,1),
round(stat.minCorr(3,1),2));
end
end

function [dataOutput, numNan, mapping] = cleanData(rawData)


dataOutput = [];
numNan = [];
mapping.fNo = [];
mapping.labels = [];

for n = 1:size(rawData,2)
feature = table2array(rawData(:,n));
if(isa(feature, 'cell')) %Must contain at least 1 string
nan = 0; %NAN Count
nanIdx = []; %Indexes of NAN
for i = 1:length(feature)
if isempty(str2num(cell2mat(feature(i)))) %Empty is NAN
nan = nan + 1; %Add to count
nanIdx = [nanIdx i]; %Save index
end
end
numNan = [numNan; [n nan]];

%Less than 90% are not numbers, this is regressive data


if(nan/length(feature) < 0.90)
fprintf('Removing strings from numerical data - Feature %d\n', n);
feature(nanIdx) = {'NaN'};
feature = str2double(feature);
%More than 90% are not numbers, this is catagorical data
else
fprintf('Converting categorical data to doubles - Feature %d\n', n);
[feature, map] = grp2idx(feature);
mapping.fNo = [mapping.fNo; n];
mapping.labels = [mapping.labels map];
end
end

if((size(unique(feature), 1) == 1) && (feature(1) == 0))


fprintf("Feature %d is a constant 0 " ,n);
fprintf("and will present as NaN in correlation
caluclations.\n");
end

%Feature should have been converted to a double


if(isa(feature,'double'))
[feature TF] = fillmissing(feature, 'linear');
dataOutput = [dataOutput feature];
%"Uh oh"
else
display("Something went wrong - Feature %d was removed", n);
end
end
end
