Proyecto Final Load Data
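The log below comes from installing the notebook's dependencies. The triggering cell is not preserved in this export, but it was presumably along these lines (psycopg2 was already present in the runtime):

!pip install rasterio netCDF4 python-dotenv psycopg2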
Collecting rasterio
  Downloading rasterio-1.3.10-cp310-cp310-manylinux2014_x86_64.whl (21.5 MB)
Collecting netCDF4
  Downloading netCDF4-1.6.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.5 MB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl
Requirement already satisfied: psycopg2 in /usr/local/lib/python3.10/dist-packages (2.9.9)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl (15 kB)
Requirement already satisfied: attrs in /usr/local/lib/python3.10/dist-packages (from rasterio) (23.2.0)
Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from rasterio) (2024.2.2)
Requirement already satisfied: click>=4.0 in /usr/local/lib/python3.10/dist-packages (from rasterio) (8.1.7)
Requirement already satisfied: cligj>=0.5 in /usr/local/lib/python3.10/dist-packages (from rasterio) (0.7.2)
Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from rasterio) (1.25.2)
Collecting snuggs>=1.4.1 (from rasterio)
  Downloading snuggs-1.4.7-py3-none-any.whl (5.4 kB)
Requirement already satisfied: click-plugins in /usr/local/lib/python3.10/dist-packages (from rasterio) (1.1.1)
Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from rasterio) (67.7.2)
Collecting cftime (from netCDF4)
  Downloading cftime-1.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Requirement already satisfied: pyparsing>=2.1.6 in /usr/local/lib/python3.10/dist-packages (from snuggs>=1.4.1->rasterio) (3.1.2)
Installing collected packages: snuggs, python-dotenv, cftime, affine, rasterio, netCDF4
Successfully installed affine-2.4.0 cftime-1.6.3 netCDF4-1.6.5 python-dotenv-1.0.1 rasterio-1.3.10 snuggs-1.4.7
Utils
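The utilities below depend on a handful of imports that this export does not show. The following set is reconstructed from the calls the notebook makes and should cover them:

import os
from datetime import date
from urllib import request

import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim
from netCDF4 import Dataset
import rasterio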
def get_root_directory():
    root_directory_path = "./datasets"
    if not os.path.exists(root_directory_path):
        os.makedirs(root_directory_path)
    return root_directory_path
def get_lat_lon_values(df_forest_fire):
    lat_values = df_forest_fire['latitude'].to_numpy()
    lon_values = df_forest_fire['longitude'].to_numpy()
    # Deduplicate coordinate pairs
    lat_lon_values = list(set(zip(lat_values, lon_values)))
    return lat_lon_values
def get_inter_extrapolated_values(df_forest_fire):
    df_forest_fire['year'] = df_forest_fire['date'].astype(str).str.slice(start=0, stop=4).astype(int)
    interpolated_values = df_forest_fire[df_forest_fire['year'] <= 2020][['latitude', 'longitude', 'year']].values
    extrapolated_values = df_forest_fire[2021 <= df_forest_fire['year']][['latitude', 'longitude', 'year']].values
    df_forest_fire = df_forest_fire.drop(columns=['year'])
    return interpolated_values, extrapolated_values

# Tail of get_forest_fire_archive_dataset (the rest of the function is not
# preserved in this export): remove the downloaded FIRMS archive files and
# return the frame.
os.remove(f"{root_directory_path}/fire_nrt_{code}.csv")
os.remove(f"{root_directory_path}/Readme.txt")
os.remove(remote_name)
os.remove(local_file)
return df_forest_fire
def download_forest_fire_dataset(root_directory_path):
    df_viirs = get_forest_fire_archive_dataset(root_directory_path, "SV-C2_457866")
    df_modis = get_forest_fire_archive_dataset(root_directory_path, "M-C61_457865")
    df_forest_fire = pd.concat([df_viirs, df_modis])
    # Map the numeric FIRMS `type` codes to readable labels via a left join
    return pd.merge(
        df_forest_fire.sort_values(by="acq_date")
        .rename(columns={"type": "fire_type", "acq_date": "date"})
        .dropna(),
        pd.DataFrame({
            "fire_type": [0, 1, 2, 3],
            "type": ["presumed vegetation fire", "active volcano",
                     "other static land source", "offshore"]
        }), on="fire_type", how="left").drop(columns=["fire_type"])
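The left join above is the standard lookup-table pattern: every detection is kept, and its numeric code picks up a label. A self-contained sketch of the same idea on toy data (not the real FIRMS archive):

import pandas as pd

# Toy detections with numeric fire-type codes (illustrative only)
detections = pd.DataFrame({"latitude": [4.6, 10.4], "fire_type": [0, 2]})
labels = pd.DataFrame({
    "fire_type": [0, 1, 2, 3],
    "type": ["presumed vegetation fire", "active volcano",
             "other static land source", "offshore"],
})
# Left join keeps every detection and attaches its label
print(pd.merge(detections, labels, on="fire_type", how="left"))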
def get_df_forest_fire(root_directory_path):
    df_forest_fire = download_forest_fire_dataset(root_directory_path)
    df_forest_fire.to_pickle(f"{root_directory_path}/forest_fire.pkl")
    df_forest_fire['date'] = pd.to_datetime(df_forest_fire['date'])
    return df_forest_fire
NDVI Data
This dataset contains dekadal NDVI indicators computed from NASA's Moderate Resolution Imaging Spectroradiometer (MODIS) collection 6.1 on the Aqua and Terra satellites, aggregated by sub-national administrative units. The administrative units used for aggregation are based on WFP data, and each unit carries a Pcode reference. The number of input pixels used to create each aggregate is provided in the n_pixels column.
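A quick way to inspect the aggregation described above is to read the HDX file directly (the URL is the one used in the download function below; the read may take a moment):

import pandas as pd

url = ("https://data.humdata.org/dataset/7f2ba5ba-8df1-41cf-ab18-fc1da928a1e5/"
       "resource/c06298d9-0d4d-4e40-aecc-abc1da75dc4d/download/col-ndvi-adm2-full.csv")
df = pd.read_csv(url)
print(df.head())                  # one row per admin unit and dekad
print(df['n_pixels'].describe())  # pixel counts behind each aggregate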
def download_ndvi_dataset(root_directory_path):
    remote_url = ("https://data.humdata.org/dataset/7f2ba5ba-8df1-41cf-ab18-fc1da928a1e5/"
                  "resource/c06298d9-0d4d-4e40-aecc-abc1da75dc4d/download/col-ndvi-adm2-full.csv")
    local_file_ndvi_dataset = f"{root_directory_path}/ndvi_Colombia.csv"
    request.urlretrieve(remote_url, local_file_ndvi_dataset)
    df_ndvi = pd.read_csv(local_file_ndvi_dataset)  # load the downloaded CSV
    return df_ndvi
def get_postal_codes(root_directory_path, postal_codes):  # name and signature reconstructed; the original def line is missing from this export
    # Postal codes
    remote_url = "https://www.datos.gov.co/api/views/ixig-z8b5/rows.csv?accessType=DOWNLOAD"
    postal_codes_path = f"{root_directory_path}/postal_codes.csv"
    request.urlretrieve(remote_url, postal_codes_path)
    column_name = "codigo_municipio"
    df_postal_codes = pd.read_csv(postal_codes_path)
    # Strip thousands separators before casting to int
    df_postal_codes[column_name] = (df_postal_codes[column_name].astype(str)
                                    .str.replace(',', '', regex=False).astype(int))
    df_postal_codes = df_postal_codes.drop_duplicates(subset=column_name, keep='first')
    result = df_postal_codes[df_postal_codes[column_name].isin(postal_codes)][
        ['nombre_departamento', 'nombre_municipio', 'codigo_municipio', 'codigo_postal']]
    result = result.sort_values(by="codigo_postal")
    result.reset_index(drop=True, inplace=True)
    os.remove(postal_codes_path)
    return result
def get_ndvi_lat_lon_values(result):
    values = result[['nombre_municipio', 'nombre_departamento']].values
    geolocator = Nominatim(user_agent="ndvi_data")
    # Municipality names that Nominatim only resolves under a different spelling
    municipality_fixed = {
        "VILLA DE SAN DIEGO DE UBATE": "UBATE",
        "CERRO SAN ANTONIO": "SAN ANTONIO",
        "SAN JUAN DE RIO SECO": "SAN JUAN DE RIOSECO",
        "TOLU VIEJO": "TOLUVIEJO",
        "SAN ANDRES DE TUMACO": "TUMACO",
        "EL CANTON DEL SAN PABLO": "EL CANTON DE SAN PABLO",
        "SAN LUIS DE SINCE": "SINCE",
        "SAN JOSE DE ALBAN": "ALBAN"
    }
    # ... geocoding loop not preserved in this export ...
    return lat_lon_ndvi

def union_ndvi_data(df_ndvi, result, lat_lon_ndvi):  # def line reconstructed from the call site below
    result.rename(columns={'codigo_municipio': 'ADM2_PCODE'}, inplace=True)
    result['ADM2_PCODE'] = 'CO' + result['ADM2_PCODE'].astype(str)
    merged_df_ndvi = pd.merge(df_ndvi, result[['latitude', 'longitude', 'ADM2_PCODE']],
                              on='ADM2_PCODE', how='left')
    merged_df_ndvi = merged_df_ndvi.drop(columns=['adm2_id', 'ADM2_PCODE'])
    return merged_df_ndvi
# Fragments apparently from save_df_ndvi (the enclosing code is not fully
# preserved): align the NDVI and fire frames and build interpolation inputs.
df_ndvi_temp.reset_index(drop=True, inplace=True)
df_forest_fire_temp.reset_index(drop=True, inplace=True)
init_date = pd.to_datetime(f'{year}-01-01')
# NDVI values
lat = df_ndvi_temp['latitude'].values
lon = df_ndvi_temp['longitude'].values
time = (df_ndvi_temp['date'] - init_date).dt.days.values

df_ndvi = pd.DataFrame(values).sort_values(by="date").dropna()
df_ndvi.to_pickle(f"{root_directory_path}/ndvi.pkl")
lat_lon_ndvi = get_ndvi_lat_lon_values(result)
merged_df_ndvi = union_ndvi_data(df_ndvi, result, lat_lon_ndvi)
collect_ndvi_data(root_directory_path, merged_df_ndvi, df_forest_fire)
def check_latlon_bounds(lat, lon, lat_index, lon_index, lat_target, lon_target):
    # Check that the nearest-neighbour indices stay in bounds around the
    # target coordinates, nudging them one step when they overshoot
    if lat[lat_index] > lat_target and lat_index != 0:
        lat_index = lat_index - 1
    if lat[lat_index] < lat_target and lat_index != len(lat) - 1:
        lat_index = lat_index + 1
    if lon[lon_index] > lon_target and lon_index != 0:
        lon_index = lon_index - 1
    if lon[lon_index] < lon_target and lon_index != len(lon) - 1:
        lon_index = lon_index + 1
    return lat_index, lon_index
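check_latlon_bounds is meant to follow a nearest-index lookup. A small usage sketch on a synthetic grid (the coordinate axes and target point are illustrative, not from the real data):

import numpy as np

lat = np.arange(-5.0, 15.0, 0.5)   # synthetic coordinate axes
lon = np.arange(-80.0, -65.0, 0.5)
lat_target, lon_target = 4.61, -74.08

# Nearest grid index to each target, then nudge with the helper above
lat_index = np.abs(lat - lat_target).argmin()
lon_index = np.abs(lon - lon_target).argmin()
lat_index, lon_index = check_latlon_bounds(lat, lon, lat_index, lon_index,
                                           lat_target, lon_target)
print(lat[lat_index], lon[lon_index])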
# Fragment of the TerraClimate subsetting routine (the enclosing function
# is not fully preserved in this export).
if lat_index_min > lat_index_max:
    lat_index_range = range(lat_index_max, lat_index_min + 1)
else:
    lat_index_range = range(lat_index_min, lat_index_max + 1)
if lon_index_min > lon_index_max:
    lon_index_range = range(lon_index_max, lon_index_min + 1)
else:
    lon_index_range = range(lon_index_min, lon_index_max + 1)
lat = lat[lat_index_range]
lon = lon[lon_index_range]
# Subset in time (TerraClimate stores time as days since 1900-01-01)
timehandle = filehandle.variables['time']
time = timehandle[:]
time_min = (date(year, 1, 1) - date(1900, 1, 1)).days
time_max = (date(year, 12, 31) - date(1900, 1, 1)).days
time_index_min = (np.abs(time - time_min)).argmin()
time_index_max = (np.abs(time - time_max)).argmin()
time_index_range = range(time_index_min, time_index_max + 1)
time = timehandle[time_index_range]
# Subset data
datahandle = filehandle.variables[varname]
data = datahandle[time_index_range, lat_index_range, lon_index_range]
# Indexes
time_indexes = get_indexes(time, time_values)
lat_indexes = get_indexes(lat, lat_values)
lon_indexes = get_indexes(lon, lon_values)
def download_global_climate_dataset(varnames):
    datasets = {}
    for year in range(2002, 2024):
        for varname in varnames:
            pathname = (f"http://thredds.northwestknowledge.net:8080/thredds/dodsC/"
                        f"TERRACLIMATE_ALL/data/TerraClimate_{varname}_{year}.nc")
            # Open remotely over OPeNDAP; no file is downloaded
            filehandle = Dataset(pathname, 'r', format="NETCDF4")
            datasets[f"{year}-{varname}"] = filehandle
    return datasets
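Putting the fragments above together, a minimal end-to-end sketch of reading one TerraClimate variable over OPeNDAP and pulling a single grid cell (the variable name and coordinates are illustrative choices, not the notebook's):

from datetime import date
import numpy as np
from netCDF4 import Dataset

year, varname = 2010, "tmax"  # illustrative choices
url = (f"http://thredds.northwestknowledge.net:8080/thredds/dodsC/"
       f"TERRACLIMATE_ALL/data/TerraClimate_{varname}_{year}.nc")
fh = Dataset(url, 'r', format="NETCDF4")
lat = fh.variables['lat'][:]
lon = fh.variables['lon'][:]
# Nearest grid cell to Bogotá
i = np.abs(lat - 4.61).argmin()
j = np.abs(lon - (-74.08)).argmin()
# All twelve monthly values for that cell and year
print(fh.variables[varname][:, i, j])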
# Save the dataset
df_global_climate['date'] = pd.to_datetime(df_global_climate['date'])
df_global_climate = df_global_climate.sort_values(by="date").dropna()
df_global_climate.to_pickle(f"{root_directory_path}/global_climate.pkl")
Legend here
def download_land_cover_dataset(root_directory_path):
    # (N, W) tile labels covering Colombia; '00' keeps the zero-padded
    # equator tile name
    range_values = [(20, 80), (10, 80), (10, 70), ('00', 80), ('00', 70)]
    for year in range(2000, 2021, 5):
        for N, W in range_values:
            remote_url = (f"https://storage.googleapis.com/earthenginepartners-hansen/"
                          f"GLCLU2000-2020/v2/{year}/{N}N_0{W}W.tif")
            land_cover_path = f"{root_directory_path}/land_cover_Colombia_{year}_{N}N_0{W}W.tif"
            request.urlretrieve(remote_url, land_cover_path)
    return range_values
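The mixed int/str entries in range_values exist only to format the GLCLU tile names; a quick check of the names they generate (no download involved):

range_values = [(20, 80), (10, 80), (10, 70), ('00', 80), ('00', 70)]
for N, W in range_values:
    print(f"{N}N_0{W}W.tif")
# -> 20N_080W.tif, 10N_080W.tif, 10N_070W.tif, 00N_080W.tif, 00N_070W.tif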
# Fragment of the land-cover sampling loop (enclosing function not fully
# preserved): accumulate sampled values per coordinate and year.
land_cover_values['lat'] += list(lat_values)
land_cover_values['lon'] += list(lon_values)
land_cover_values['year'] += list(np.full(len(lat_values), year))
land_cover_values['land_cover'] += list(result)
os.remove(f"{root_directory_path}/land_cover_Colombia_{year}_{name}.tif")
df = pd.DataFrame(land_cover_values)
df.to_csv(f"{root_directory_path}/land_cover_Colombia_{name}.csv", index=False)

# Tail of union_land_cover_data:
return df_land_cover.sort_values(by="year").dropna()
from sklearn.neighbors import KNeighborsClassifier

def get_model(df_land_cover):
    X = df_land_cover[['lat', 'lon', 'year']].values
    y = df_land_cover['land_cover'].values
    # Assumption: the model construction is missing from this export; a
    # nearest-neighbour classifier is a stand-in, not the original choice
    model = KNeighborsClassifier(n_neighbors=1).fit(X, y)
    return model
def save_df_land_cover_predicted(root_directory_path, model,
                                 interpolated_values, extrapolated_values):
    land_covers_interpolated = model.predict(interpolated_values)
    land_covers_extrapolated = model.predict(extrapolated_values)
    df_land_cover_predicted = pd.DataFrame({
        'latitude': np.append(interpolated_values[:, 0], extrapolated_values[:, 0]),
        'longitude': np.append(interpolated_values[:, 1], extrapolated_values[:, 1]),
        'year': np.append(interpolated_values[:, 2], extrapolated_values[:, 2]).astype(int),
        'land_cover': np.append(land_covers_interpolated, land_covers_extrapolated).astype(int)
    }).sort_values(by="year")
    df_land_cover_predicted.to_pickle(f"{root_directory_path}/land_cover.pkl")
def download_land_cover_legend(root_directory_path):
    remote_url = "https://storage.googleapis.com/earthenginepartners-hansen/GLCLU2000-2020/legend.xlsx"
    land_cover_legend_path = f"{root_directory_path}/land_cover_legend_Colombia.xlsx"
    request.urlretrieve(remote_url, land_cover_legend_path)
    return land_cover_legend_path
def save_df_land_cover_legend(root_directory_path, land_cover_legend_path):
    df_land_cover_legend = pd.read_excel(land_cover_legend_path)
    df_land_cover_legend = (df_land_cover_legend.drop(columns=["Color code"])
                            .rename(columns={'Unnamed: 2': 'class'}))
    # Same column
    set_values(df_land_cover_legend, 'General class', 'General class',
               [(0, 96), (100, 196), (200, 207)], [97, 197, 208, 242, 245, 251, 255])
    set_values(df_land_cover_legend, 'class', 'class',
               [(0, 1), (2, 18), (19, 24), (25, 48), (100, 101), (102, 118),
                (119, 124), (125, 148)], [49, 149])
    # Other column
    set_values(df_land_cover_legend, 'class', 'General class',
               [(200, 207), (241, 241), (244, 244), (250, 250), (254, 254)])
    set_values(df_land_cover_legend, 'Sub-class', 'General class',
               [(241, 241), (244, 244), (250, 250), (254, 254)])
    df_land_cover_legend.to_pickle(f"{root_directory_path}/land_cover_legend.pkl")
    os.remove(land_cover_legend_path)
def save_df_land_cover(root_directory_path, df_forest_fire):  # signature reconstructed from the call in download_data
    range_values = download_land_cover_dataset(root_directory_path)
    # Next two lines reconstructed from the utility helpers defined above
    lat_lon_values = get_lat_lon_values(df_forest_fire)
    interpolated_values, extrapolated_values = get_inter_extrapolated_values(df_forest_fire)
    split_lat_lon_values(lat_lon_values, range_values, root_directory_path)
    df_land_cover = union_land_cover_data(range_values, root_directory_path)
    land_cover_legend_path = download_land_cover_legend(root_directory_path)
    model = get_model(df_land_cover)
    save_df_land_cover_predicted(root_directory_path, model,
                                 interpolated_values, extrapolated_values)
    save_df_land_cover_legend(root_directory_path, land_cover_legend_path)
# Fragment of the population-density extraction (enclosing function not
# fully preserved).
# Minimums
lat_min, lat_max = lat_values.min(), lat_values.max()
lon_min, lon_max = lon_values.min(), lon_values.max()
# Set values
df_pd.reset_index(drop=True, inplace=True)
lat, lon = df_pd['latitude'].to_numpy(), df_pd['longitude'].to_numpy()

df_population_density.to_csv(f"{root_directory_path}/population_density_Colombia_{year}.csv", index=False)
os.remove(pd_path)
def union_pd_data(root_directory_path):
    df_population_density = pd.DataFrame()
    for year in range(2002, 2021):
        df_ppd = pd.read_csv(f"{root_directory_path}/population_density_Colombia_{year}.csv")
        df_population_density = pd.concat([df_population_density, df_ppd])
        os.remove(f"{root_directory_path}/population_density_Colombia_{year}.csv")
    return df_population_density.sort_values(by="year").dropna()
from sklearn.ensemble import RandomForestRegressor

def get_regressor(df_population_density):
    X = df_population_density[['latitude', 'longitude', 'year']].values  # features: latitude, longitude and year
    y = df_population_density['population_density'].values  # population density as the dependent variable
    # Assumption: the regressor construction is missing from this export; a
    # random-forest regressor is a stand-in, not the original choice
    regressor = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)
    return regressor
def save_df_population_density_predicted(root_directory_path, df_population_density,
                                         regressor, extrapolated_values):
    densities_predicted = regressor.predict(extrapolated_values)
    df_pd_predicted = pd.DataFrame({
        'latitude': extrapolated_values[:, 0],
        'longitude': extrapolated_values[:, 1],
        'year': extrapolated_values[:, 2].astype(int),
        'population_density': densities_predicted
    })
    df_pd_predicted = pd.concat([df_population_density, df_pd_predicted]).sort_values(by="year")
    df_pd_predicted.to_pickle(f"{root_directory_path}/population_density.pkl")
def save_df_population_density(root_directory_path, df_forest_fire):  # signature reconstructed from the call in download_data
    _, extrapolated_values = get_inter_extrapolated_values(df_forest_fire)
    df_population_density = union_pd_data(root_directory_path)
    regressor = get_regressor(df_population_density)
    save_df_population_density_predicted(root_directory_path, df_population_density,
                                         regressor, extrapolated_values)
Union Data
def read_values(root_directory_path):
    df_ndvi = pd.read_pickle(f"{root_directory_path}/ndvi.pkl")
    df_land_cover = pd.read_pickle(f"{root_directory_path}/land_cover.pkl")
    df_forest_fire = pd.read_pickle(f"{root_directory_path}/forest_fire.pkl")
    df_global_climate = pd.read_pickle(f"{root_directory_path}/global_climate.pkl")
    df_land_cover_legend = pd.read_pickle(f"{root_directory_path}/land_cover_legend.pkl")
    df_population_density = pd.read_pickle(f"{root_directory_path}/population_density.pkl")
    # Return the frames in the order union_data unpacks them
    return (df_ndvi, df_land_cover, df_forest_fire, df_global_climate,
            df_land_cover_legend, df_population_density)
def union_data(root_directory_path):
    (df_ndvi, df_land_cover, df_forest_fire, df_global_climate,
     df_land_cover_legend, df_population_density) = read_values(root_directory_path)
    df_forest_fire['date'] = pd.to_datetime(df_forest_fire['date'])
    df_forest_fire['year'] = df_forest_fire['date'].astype(str).str.slice(start=0, stop=4).astype(int)
    # ... the merge steps that build df_final are not preserved in this export ...
    return df_final
Download data
def download_data(root_directory_path):
    df_forest_fire = get_df_forest_fire(root_directory_path)
    save_df_ndvi(root_directory_path, df_forest_fire.copy())
    save_df_global_climate(root_directory_path, df_forest_fire.copy())
    save_df_land_cover(root_directory_path, df_forest_fire.copy())
    save_df_population_density(root_directory_path, df_forest_fire.copy())
Save data
root_directory_path = get_root_directory()
download_data(root_directory_path)
df_final = union_data(root_directory_path)
df_final.head(5)
{"type":"dataframe","variable_name":"df_final"}
df_final.to_pickle(f"{root_directory_path}/final_dataset.pkl")