Download as pdf or txt
Download as pdf or txt
You are on page 1 of 12

3cqonq1zj

June 6, 2024

1 GUIA DE LABORATORIO
1.1 APELLIDOS, Nombres: AMES CAMAYO, DANIEL VIDES
Fecha: 06 de Octubre del 2023

2 STANDARD LIBRARIES:
[103]: from matplotlib import style
style.use("ggplot")

from pathlib import Path


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

3 CUSTOMIZED LIBRARIES:
[104]: import seaborn as sns
from scipy.stats import randint as sp_randint
from sklearn.decomposition import PCA

4 EXTRACCION DE DATOS:
[105]: df = pd.read_csv("https://raw.githubusercontent.com/tatsath/fin-ml/master/
↪Chapter%208%20-%20Unsup.%20Learning%20-%20Clustering/Data_MasterTemplate.

↪csv", parse_dates=True, index_col=0)

df

[105]: MMM AXP AAPL BA CAT \


Date
2000-01-03 29.847043 35.476634 3.530576 26.650218 14.560887
2000-01-04 28.661131 34.134275 3.232839 26.610431 14.372251
2000-01-05 30.122175 33.959430 3.280149 28.473758 14.914205
2000-01-06 31.877325 33.959430 2.996290 28.553331 15.459153
2000-01-07 32.509812 34.433913 3.138219 29.382213 15.962182

1
… … … … … …
2019-01-31 200.300000 102.700000 166.440000 385.620000 133.160000
2019-02-01 199.160000 103.060000 166.520000 387.430000 130.910000
2019-02-04 200.210000 103.420000 171.250000 397.000000 130.880000
2019-02-05 201.120000 103.900000 174.180000 410.180000 132.000000
2019-02-06 202.570000 104.960000 174.240000 411.110000 130.540000

CVX CSCO KO DIS DWDP … \


Date …
2000-01-03 21.582046 43.003876 16.983583 23.522220 NaN …
2000-01-04 21.582046 40.577200 17.040950 24.899860 NaN …
2000-01-05 22.049145 40.895453 17.228147 25.781550 NaN …
2000-01-06 22.903343 39.781569 17.210031 24.899860 NaN …
2000-01-07 23.305926 42.128682 18.342270 24.506249 NaN …
… … … … … … …
2019-01-31 114.650000 47.290000 48.130000 111.520000 53.81 …
2019-02-01 118.370000 47.340000 48.700000 111.300000 53.47 …
2019-02-04 119.740000 47.350000 49.250000 111.800000 53.13 …
2019-02-05 119.480000 47.260000 49.260000 112.660000 54.26 …
2019-02-06 118.880000 47.480000 49.260000 111.410000 53.21 …

NKE PFE PG TRV UTX \


Date
2000-01-03 4.701180 16.746856 32.227726 20.158885 21.319030
2000-01-04 4.445214 16.121738 31.596399 19.890099 20.445803
2000-01-05 4.702157 16.415912 31.325831 20.085579 20.254784
2000-01-06 4.677733 16.972739 32.438168 20.122232 20.998392
2000-01-07 4.677733 18.123166 35.023602 20.922479 21.830687
… … … … … …
2019-01-31 81.880000 42.450000 96.470000 125.540000 118.070000
2019-02-01 81.510000 42.880000 97.470000 126.490000 118.980000
2019-02-04 81.990000 42.440000 98.030000 126.040000 119.140000
2019-02-05 82.860000 42.110000 97.440000 126.180000 120.770000
2019-02-06 82.710000 42.330000 97.920000 125.550000 121.360000

UNH VZ V WMT WBA


Date
2000-01-03 5.841355 22.564221 NaN 47.337599 21.713237
2000-01-04 5.766368 21.833915 NaN 45.566248 20.907354
2000-01-05 5.753327 22.564221 NaN 44.503437 21.097421
2000-01-06 5.964159 22.449405 NaN 45.126952 20.527220
2000-01-07 6.662948 22.282692 NaN 48.535033 21.051805
… … … … … …
2019-01-31 270.200000 55.060000 135.01 95.830000 72.260000
2019-02-01 268.720000 54.550000 140.15 93.860000 71.880000
2019-02-04 268.210000 54.040000 141.50 94.770000 71.460000
2019-02-05 266.310000 54.140000 142.53 95.600000 71.310000

2
2019-02-06 269.500000 53.790000 141.49 95.640000 71.470000

[4804 rows x 30 columns]

4.1 TRANSFORMACION Y/O LIMPIEZA - PRE-PROCESAMIENTO: ETL


[106]: # Drop the Dow Chemicals (DWDP) and Visa (V) columns, which contain NaN ('Not a Number') values
# Eliminación de las columnas con valores NaN ("No es un número") de Dow Chemicals (DWDP) y Visa (V)

# Remove the two tickers in place; `columns=` names the axis explicitly.
df.drop(columns=['DWDP', 'V'], inplace=True)

# Preview the first two remaining rows.
df.head(2)

[106]: MMM AXP AAPL BA CAT CVX \


Date
2000-01-03 29.847043 35.476634 3.530576 26.650218 14.560887 21.582046
2000-01-04 28.661131 34.134275 3.232839 26.610431 14.372251 21.582046

CSCO KO DIS XOM … MSFT \


Date …
2000-01-03 43.003876 16.983583 23.52222 23.862240 … 38.135101
2000-01-04 40.577200 17.040950 24.89986 23.405167 … 36.846046

NKE PFE PG TRV UTX UNH \


Date
2000-01-03 4.701180 16.746856 32.227726 20.158885 21.319030 5.841355
2000-01-04 4.445214 16.121738 31.596399 19.890099 20.445803 5.766368

VZ WMT WBA
Date
2000-01-03 22.564221 47.337599 21.713237
2000-01-04 21.833915 45.566248 20.907354

[2 rows x 28 columns]

[107]: # Copying the dataframe to add features


# Work on an independent copy so that `df` keeps the untouched input table
# for the later cells. `DataFrame.copy()` already returns a DataFrame, so no
# extra `pd.DataFrame(...)` wrapper is needed.
data = df.copy()
data.head(2)

[107]: MMM AXP AAPL BA CAT CVX \


Date
2000-01-03 29.847043 35.476634 3.530576 26.650218 14.560887 21.582046
2000-01-04 28.661131 34.134275 3.232839 26.610431 14.372251 21.582046

CSCO KO DIS XOM … MSFT \


Date …

3
2000-01-03 43.003876 16.983583 23.52222 23.862240 … 38.135101
2000-01-04 40.577200 17.040950 24.89986 23.405167 … 36.846046

NKE PFE PG TRV UTX UNH \


Date
2000-01-03 4.701180 16.746856 32.227726 20.158885 21.319030 5.841355
2000-01-04 4.445214 16.121738 31.596399 19.890099 20.445803 5.766368

VZ WMT WBA
Date
2000-01-03 22.564221 47.337599 21.713237
2000-01-04 21.833915 45.566248 20.907354

[2 rows x 28 columns]

[108]: # Daily Returns


# Daily log returns: ln(row_t / row_{t-1}) for every column, kept as plain
# fractions (not scaled to percent). The first date has no previous row and
# therefore comes out as NaN.
datareturns = np.log(data.div(data.shift(periods=1)))

[109]: datareturns.head()

[109]: MMM AXP AAPL BA CAT CVX \


Date
2000-01-03 NaN NaN NaN NaN NaN NaN
2000-01-04 -0.040544 -0.038572 -0.088100 -0.001494 -0.013040 0.000000
2000-01-05 0.049720 -0.005135 0.014528 0.067680 0.037015 0.021412
2000-01-06 0.056633 0.000000 -0.090514 0.002791 0.035887 0.038009
2000-01-07 0.019647 0.013875 0.046281 0.028616 0.032021 0.017425

CSCO KO DIS XOM … MSFT NKE \


Date …
2000-01-03 NaN NaN NaN NaN … NaN NaN
2000-01-04 -0.058084 0.003372 0.056917 -0.019340 … -0.034387 -0.055986
2000-01-05 0.007813 0.010925 0.034797 0.048536 … 0.010511 0.056193
2000-01-06 -0.027615 -0.001052 -0.034797 0.055013 … -0.034050 -0.005208
2000-01-07 0.057325 0.063716 -0.015934 -0.002939 … 0.013006 0.000000

PFE PG TRV UTX UNH VZ \


Date
2000-01-03 NaN NaN NaN NaN NaN NaN
2000-01-04 -0.038042 -0.019784 -0.013423 -0.041822 -0.012920 -0.032901
2000-01-05 0.018083 -0.008600 0.009780 -0.009387 -0.002264 0.032901
2000-01-06 0.033357 0.034893 0.001823 0.036055 0.035990 -0.005101
2000-01-07 0.065583 0.076686 0.038999 0.038871 0.110794 -0.007454

4
WMT WBA
Date
2000-01-03 NaN NaN
2000-01-04 -0.038138 -0.037821
2000-01-05 -0.023601 0.009050
2000-01-06 0.013913 -0.027399
2000-01-07 0.072806 0.025234

[5 rows x 28 columns]

[110]: datareturns_log = pd.DataFrame(datareturns.copy())

[111]: # Daily linear (simple) returns, expressed as fractions rather than percent


# Rebinds `datareturns` (previously the log returns) to the one-period
# relative change of every column.
datareturns = data.pct_change(periods=1)

[112]: datareturns.head(3)

[112]: MMM AXP AAPL BA CAT CVX \


Date
2000-01-03 NaN NaN NaN NaN NaN NaN
2000-01-04 -0.039733 -0.037838 -0.084331 -0.001493 -0.012955 0.000000
2000-01-05 0.050976 -0.005122 0.014634 0.070022 0.037708 0.021643

CSCO KO DIS XOM … MSFT NKE \


Date …
2000-01-03 NaN NaN NaN NaN … NaN NaN
2000-01-04 -0.056429 0.003378 0.058568 -0.019155 … -0.033802 -0.054447
2000-01-05 0.007843 0.010985 0.035409 0.049733 … 0.010567 0.057802

PFE PG TRV UTX UNH VZ \


Date
2000-01-03 NaN NaN NaN NaN NaN NaN
2000-01-04 -0.037327 -0.019590 -0.013333 -0.040960 -0.012837 -0.032366
2000-01-05 0.018247 -0.008563 0.009828 -0.009343 -0.002262 0.033448

WMT WBA
Date
2000-01-03 NaN NaN
2000-01-04 -0.037420 -0.037115
2000-01-05 -0.023325 0.009091

[3 rows x 28 columns]

[113]: # Dow Jones Equal Weighted Return


# Equal-weighted index proxy: the mean across all columns on each date,
# stored as an extra "DJIA" column.
datareturns["DJIA"] = datareturns.mean(axis="columns")
datareturns["DJIA"].head(3)

5
[113]: Date
2000-01-03 NaN
2000-01-04 -0.031058
2000-01-05 0.019281
Name: DJIA, dtype: float64

[114]: # Data Raw


# Build the working frame without aliasing: `dropna` returns a new object, so
# `datareturns` is no longer modified as a side effect through `data_raw`.
# Only rows in which every column is NaN are removed (the first date, which
# has no previous row to compute a return from).
data_raw = datareturns.dropna(how='all')
data_raw.head(3)

[114]: MMM AXP AAPL BA CAT CVX \


Date
2000-01-04 -0.039733 -0.037838 -0.084331 -0.001493 -0.012955 0.000000
2000-01-05 0.050976 -0.005122 0.014634 0.070022 0.037708 0.021643
2000-01-06 0.058268 0.000000 -0.086538 0.002795 0.036539 0.038741

CSCO KO DIS XOM … NKE PFE \


Date …
2000-01-04 -0.056429 0.003378 0.058568 -0.019155 … -0.054447 -0.037327
2000-01-05 0.007843 0.010985 0.035409 0.049733 … 0.057802 0.018247
2000-01-06 -0.027237 -0.001052 -0.034198 0.056555 … -0.005194 0.033920

PG TRV UTX UNH VZ WMT \


Date
2000-01-04 -0.019590 -0.013333 -0.040960 -0.012837 -0.032366 -0.037420
2000-01-05 -0.008563 0.009828 -0.009343 -0.002262 0.033448 -0.023325
2000-01-06 0.035509 0.001825 0.036713 0.036645 -0.005088 0.014011

WBA DJIA
Date
2000-01-04 -0.037115 -0.031058
2000-01-05 0.009091 0.019281
2000-01-06 -0.027027 0.002579

[3 rows x 29 columns]

[115]: # Normalizing the returns


# Z-score every column: subtract the column mean, then divide by the column
# standard deviation.
# NOTE: this rebinds `data`, which until now held the copied input table.
data = data_raw.sub(data_raw.mean()).div(data_raw.std())

data.head(3)

[115]: MMM AXP AAPL BA CAT CVX \


Date
2000-01-04 -2.783305 -1.714531 -3.315064 -0.118649 -0.672149 -0.030177
2000-01-05 3.491453 -0.250475 0.522450 3.667073 1.828679 1.321853

6
2000-01-06 3.995818 -0.021248 -3.400666 0.108313 1.770951 2.389941

CSCO KO DIS XOM … NKE PFE \


Date …
2000-01-04 -2.304726 0.236964 3.089027 -1.283097 … -2.927985 -2.410473
2000-01-05 0.305449 0.823741 1.857074 3.247575 … 3.023666 1.148279
2000-01-06 -1.119210 -0.104682 -1.845881 3.696220 … -0.316511 2.151911

PG TRV UTX UNH VZ WMT \


Date
2000-01-04 -1.504783 -0.763609 -2.479568 -0.698935 -2.097284 -2.506437
2000-01-05 -0.671468 0.510782 -0.588825 -0.164440 2.128033 -1.568817
2000-01-06 2.659281 0.070428 2.165327 1.801922 -0.346058 0.914757

WBA DJIA
Date
2000-01-04 -2.167895 -2.739489
2000-01-05 0.502376 1.631646
2000-01-06 -1.584911 0.181378

[3 rows x 29 columns]

[116]: # Getting rid of the NaN values.


# Discard, in place, every date on which at least one column is NaN, in both
# the raw-return frame and its standardised counterpart.
data_raw.dropna(axis='index', how='any', inplace=True)
data.dropna(axis='index', how='any', inplace=True)

5 MODELO(S): Modelamiento: Crear modelo, FIT (Ajustar, Entrenar), PREDICT (generar valores ajustados y/o predicción)
[117]: import matplotlib.cm as cm

[118]: from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Standardise the columns of `df` before clustering.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# `df` receives a 'Cluster' label column in a later cell; drop it if it is
# already there so that re-running this cell never feeds the labels back in
# as a feature. On a fresh run the column is absent and nothing changes.
scaled_df = scaler.fit_transform(df.drop(columns='Cluster', errors='ignore'))

# Ward-linkage hierarchical clustering. Every row of `df` (one date) is an
# observation, so this hierarchy groups dates rather than tickers.
linked = linkage(scaled_df, method='ward')

linked

7
[118]: array([[7.69000000e+02, 7.70000000e+02, 1.87207085e-02, 2.00000000e+00],
[1.50200000e+03, 1.50300000e+03, 2.57043425e-02, 2.00000000e+00],
[9.07000000e+02, 9.08000000e+02, 2.70617818e-02, 2.00000000e+00],
…,
[9.60100000e+03, 9.60200000e+03, 1.36129958e+02, 3.28300000e+03],
[9.59500000e+03, 9.60300000e+03, 1.63674978e+02, 1.52100000e+03],
[9.60400000e+03, 9.60500000e+03, 4.26367861e+02, 4.80400000e+03]])

[119]: corr = data.corr()


# Heat-map of the correlation matrix `corr`, with one tick per column.
size = 7
fig, ax = plt.subplots(figsize=(size, size))
# The colormap is passed by name: `cm.get_cmap` is deprecated since
# Matplotlib 3.7. The colour scale is clamped to [0, 1], so any negative
# correlation is drawn with the colour assigned to 0.
ax.matshow(corr, cmap='coolwarm', vmin=0, vmax=1)
plt.xticks(range(len(corr.columns)), corr.columns, rotation='vertical', fontsize=8);
plt.yticks(range(len(corr.columns)), corr.columns, fontsize=8);

<ipython-input-119-fd0d95bf18a3>:4: MatplotlibDeprecationWarning: The get_cmap


function was deprecated in Matplotlib 3.7 and will be removed two minor releases
later. Use ``matplotlib.colormaps[name]`` or
``matplotlib.colormaps.get_cmap(obj)`` instead.
ax.matshow(corr,cmap=cm.get_cmap('coolwarm'), vmin=0,vmax=1)

8
[120]: from scipy.cluster.hierarchy import dendrogram, linkage
dendrogram

[120]: <function scipy.cluster.hierarchy.dendrogram(Z, p=30, truncate_mode=None,


color_threshold=None, get_leaves=True, orientation='top', labels=None,
count_sort=False, distance_sort=False, show_leaf_counts=True, no_plot=False,
no_labels=False, leaf_font_size=None, leaf_rotation=None, leaf_label_func=None,
show_contracted=False, link_color_func=None, ax=None,
above_threshold_color='C0')>

[121]: Z = linkage(corr, 'average')


Z

9
[121]: array([[ 5. , 9. , 0.27207065, 2. ],
[10. , 15. , 0.45595649, 2. ],
[ 1. , 30. , 0.53472591, 3. ],
[ 6. , 13. , 0.55217128, 2. ],
[ 0. , 23. , 0.59795924, 2. ],
[18. , 32. , 0.63802964, 3. ],
[ 4. , 33. , 0.6421695 , 3. ],
[17. , 20. , 0.65092769, 2. ],
[14. , 36. , 0.6956485 , 3. ],
[12. , 34. , 0.71936254, 4. ],
[ 3. , 35. , 0.73536588, 4. ],
[ 8. , 31. , 0.76549928, 4. ],
[ 7. , 21. , 0.78186826, 2. ],
[11. , 26. , 0.78217715, 2. ],
[22. , 40. , 0.79465521, 5. ],
[39. , 43. , 0.83270992, 9. ],
[19. , 42. , 0.91368815, 3. ],
[ 2. , 38. , 0.91841374, 5. ],
[37. , 41. , 0.93692083, 5. ],
[29. , 44. , 0.9489942 , 11. ],
[27. , 45. , 0.97363533, 4. ],
[25. , 47. , 0.98299793, 6. ],
[16. , 49. , 1.01064171, 5. ],
[24. , 50. , 1.01076057, 7. ],
[51. , 52. , 1.0574304 , 12. ],
[46. , 48. , 1.06366375, 16. ],
[53. , 54. , 1.13906782, 28. ],
[28. , 55. , 1.46627118, 29. ]])

6 EVALUAR MODELOS: Métricas


[122]: from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist
import pylab
# Cophenetic correlation: how well the merge heights in Z agree with the
# pairwise distances between the rows of `corr` (pdist's default metric).
c, coph_dists = cophenet(Z, Y=pdist(corr))
c

[122]: 0.7996812457331596

[123]: plt.figure(figsize=(25, 10))


# Draw the dendrogram of Z on the current figure, label the leaves with the
# column names of `corr`, save it to disk and display it.
labelsize = 20
ticksize = 15
plt.title('Hierarchical Clustering Dendrogram for DJIA', fontsize=labelsize)
plt.xlabel('stock', fontsize=labelsize)
plt.ylabel('distance', fontsize=labelsize)
dendrogram(
    Z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
    labels=corr.columns,
)
# NOTE(review): these two calls run after dendrogram() and appear to override
# leaf_rotation / leaf_font_size above -- confirm which styling is intended.
plt.yticks(fontsize=ticksize)
plt.xticks(rotation=-90, fontsize=ticksize)
# The file name (including its spelling) is kept unchanged so that anything
# already pointing at this image keeps working.
plt.savefig('dendogram_DJIA.png')
plt.show()

#INTERPRETACIÓN DE RESULTADOS:
[124]: # Interpretación basada en el dendrograma y los clusters formados
# Se pueden identificar cuántos clusters son óptimos y etiquetar los datos en consecuencia.

# fcluster was already imported together with linkage earlier in the
# notebook; the import is repeated in this cell.
from scipy.cluster.hierarchy import fcluster

# Cut the `linked` hierarchy into at most 4 flat clusters; the result holds
# one label per row of `df`.
clusters = fcluster(linked, t=4, criterion='maxclust')

# Store the label next to each row and preview the first rows.
df['Cluster'] = clusters
df.head()

[124]: MMM AXP AAPL BA CAT CVX \


Date
2000-01-03 29.847043 35.476634 3.530576 26.650218 14.560887 21.582046
2000-01-04 28.661131 34.134275 3.232839 26.610431 14.372251 21.582046
2000-01-05 30.122175 33.959430 3.280149 28.473758 14.914205 22.049145
2000-01-06 31.877325 33.959430 2.996290 28.553331 15.459153 22.903343
2000-01-07 32.509812 34.433913 3.138219 29.382213 15.962182 23.305926

CSCO KO DIS XOM … NKE \

11
Date …
2000-01-03 43.003876 16.983583 23.522220 23.862240 … 4.701180
2000-01-04 40.577200 17.040950 24.899860 23.405167 … 4.445214
2000-01-05 40.895453 17.228147 25.781550 24.569179 … 4.702157
2000-01-06 39.781569 17.210031 24.899860 25.958680 … 4.677733
2000-01-07 42.128682 18.342270 24.506249 25.882501 … 4.677733

PFE PG TRV UTX UNH VZ \


Date
2000-01-03 16.746856 32.227726 20.158885 21.319030 5.841355 22.564221
2000-01-04 16.121738 31.596399 19.890099 20.445803 5.766368 21.833915
2000-01-05 16.415912 31.325831 20.085579 20.254784 5.753327 22.564221
2000-01-06 16.972739 32.438168 20.122232 20.998392 5.964159 22.449405
2000-01-07 18.123166 35.023602 20.922479 21.830687 6.662948 22.282692

WMT WBA Cluster


Date
2000-01-03 47.337599 21.713237 2
2000-01-04 45.566248 20.907354 2
2000-01-05 44.503437 21.097421 2
2000-01-06 45.126952 20.527220 2
2000-01-07 48.535033 21.051805 2

[5 rows x 29 columns]

12

You might also like