Final Project: Load Data (Compressed)


Libraries

pip install rasterio netCDF4 python-dotenv psycopg2

Collecting rasterio
  Downloading rasterio-1.3.10-cp310-cp310-manylinux2014_x86_64.whl (21.5 MB)
  Downloading ...anylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.5 MB)
Requirement already satisfied: psycopg2 in /usr/local/lib/python3.10/dist-packages (2.9.9)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl (15 kB)
Requirement already satisfied: attrs in /usr/local/lib/python3.10/dist-packages (from rasterio) (23.2.0)
Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from rasterio) (2024.2.2)
Requirement already satisfied: click>=4.0 in /usr/local/lib/python3.10/dist-packages (from rasterio) (8.1.7)
Requirement already satisfied: cligj>=0.5 in /usr/local/lib/python3.10/dist-packages (from rasterio) (0.7.2)
Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from rasterio) (1.25.2)
Collecting snuggs>=1.4.1 (from rasterio)
  Downloading snuggs-1.4.7-py3-none-any.whl (5.4 kB)
Requirement already satisfied: click-plugins in /usr/local/lib/python3.10/dist-packages (from rasterio) (1.1.1)
Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from rasterio) (67.7.2)
Collecting cftime (from netCDF4)
  Downloading cftime-1.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Requirement already satisfied: pyparsing>=2.1.6 in /usr/local/lib/python3.10/dist-packages (from snuggs>=1.4.1->rasterio) (3.1.2)
Installing collected packages: snuggs, python-dotenv, cftime, affine, rasterio, netCDF4
Successfully installed affine-2.4.0 cftime-1.6.3 netCDF4-1.6.5 python-dotenv-1.0.1 rasterio-1.3.10 snuggs-1.4.7

from sklearn.model_selection import train_test_split


from sklearn.tree import DecisionTreeRegressor
from sqlalchemy.orm import sessionmaker
from geopy.geocoders import Nominatim
from sqlalchemy import create_engine
from scipy.spatial import cKDTree
from pyproj import Transformer
from netCDF4 import Dataset
from sqlalchemy import text
from urllib import request
from datetime import date
import pandas as pd
import numpy as np
import rasterio
import psycopg2
import zipfile
import math
import os

Utils
def get_root_directory():
    root_directory_path = "./datasets"

    if not os.path.exists(root_directory_path):
        os.makedirs(root_directory_path)

    return root_directory_path

def get_lat_lon_values(df_forest_fire):
    lat_values, lon_values = df_forest_fire['latitude'].to_numpy(), df_forest_fire['longitude'].to_numpy()
    lat_lon_values = list(set([(lat, lon) for lat, lon in zip(lat_values, lon_values)]))

    return lat_lon_values

def get_inter_extrapolated_values(df_forest_fire):
    df_forest_fire['year'] = df_forest_fire['date'].astype(str).str.slice(start=0, stop=4).astype(int)
    interpolated_values = df_forest_fire[df_forest_fire['year'] <= 2020][['latitude', 'longitude', 'year']].values
    extrapolated_values = df_forest_fire[2021 <= df_forest_fire['year']][['latitude', 'longitude', 'year']].values
    df_forest_fire = df_forest_fire.drop(columns=['year'])

    return interpolated_values, extrapolated_values


Forest Fire Data
Near real-time (NRT) Moderate Resolution Imaging Spectroradiometer (MODIS) Thermal
Anomalies / Fire locations - Collection 61 processed by NASA's Land, Atmosphere Near real-
time Capability for EO (LANCE) Fire Information for Resource Management System (FIRMS),
using swath products (MOD14/MYD14) rather than the tiled MOD14A1 and MYD14A1 products.
The thermal anomalies / active fire represent the center of a 1 km pixel that is flagged by the
MODIS MOD14/MYD14 Fire and Thermal Anomalies algorithm (Giglio 2003) as containing one
or more fires within the pixel. This is the most basic fire product in which active fires and other
thermal anomalies, such as volcanoes, are identified.

For more information here
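Each detection row carries a numeric thermal-anomaly code. As a quick orientation, the sketch below (an illustrative snippet, not part of the pipeline) tabulates detections by type using the same code-to-label mapping applied in download_forest_fire_dataset further down; it assumes df_archive is a FIRMS fire-archive CSV already loaded with pd.read_csv and that it exposes the raw type column used there.

# Hedged sketch: count detections per thermal-anomaly type.
# Assumes df_archive was read from a FIRMS fire_archive_*.csv file.
type_labels = {0: "presumed vegetation fire", 1: "active volcano",
               2: "other static land source", 3: "offshore"}
print(df_archive["type"].map(type_labels).value_counts())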

def get_forest_fire_archive_dataset(root_directory_path, code):
    remote_url = f"https://firms.modaps.eosdis.nasa.gov/data/download/DL_FIRE_{code}.zip"
    local_file = f"{root_directory_path}/forest_fire_Colombia.zip"
    remote_name = f"{root_directory_path}/fire_archive_{code}.csv"
    request.urlretrieve(remote_url, local_file)

    with zipfile.ZipFile(local_file, 'r') as zip_ref:
        zip_ref.extractall(root_directory_path)
    df_forest_fire = pd.read_csv(remote_name)

    os.remove(f"{root_directory_path}/fire_nrt_{code}.csv")
    os.remove(f"{root_directory_path}/Readme.txt")
    os.remove(remote_name)
    os.remove(local_file)

    return df_forest_fire

def download_forest_fire_dataset(root_directory_path):
    df_viirs = get_forest_fire_archive_dataset(root_directory_path, "SV-C2_457866")
    df_modis = get_forest_fire_archive_dataset(root_directory_path, "M-C61_457865")
    df_forest_fire = pd.concat([df_viirs, df_modis])

    return pd.merge(
        df_forest_fire.sort_values(by="acq_date")
            .rename(columns={"type": "fire_type", "acq_date": "date"})
            .dropna(),
        pd.DataFrame({
            "fire_type": [0, 1, 2, 3],
            "type": ["presumed vegetation fire", "active volcano",
                     "other static land source", "offshore"]
        }), on="fire_type", how="left").drop(columns=["fire_type"])

def get_df_forest_fire(root_directory_path):
    df_forest_fire = download_forest_fire_dataset(root_directory_path)
    df_forest_fire.to_pickle(f"{root_directory_path}/forest_fire.pkl")
    df_forest_fire['date'] = pd.to_datetime(df_forest_fire['date'])

    return df_forest_fire

NDVI Data
This dataset contains dekadal NDVI indicators computed from NASA's Moderate Resolution
Imaging Spectroradiometer (MODIS) collection 6.1 from the Aqua and Terra satellites, aggregated
by sub-national administrative units.

Included indicators are (for each dekad):

• 10 day NDVI (vim)
• NDVI long term average (vim_lta)
• 10 day NDVI anomaly [%] (viq)

The administrative units used for aggregation are based on WFP data and contain a Pcode
reference attributed to each unit. The number of input pixels used to create the aggregates is
provided in the n_pixels column.

More information here
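The anomaly indicator relates the current dekad to its long-term average. As a rough illustration (an assumption about how viq is derived, not a statement from the dataset documentation quoted above), the percent anomaly can be sketched as:

# Hedged sketch: if vim is the 10-day NDVI and vim_lta its long-term average,
# the percent anomaly is presumably viq ~= 100 * vim / vim_lta.
import pandas as pd

dekad = pd.DataFrame({"vim": [0.62, 0.48], "vim_lta": [0.60, 0.60]})
dekad["viq"] = 100 * dekad["vim"] / dekad["vim_lta"]
print(dekad)   # ~103.3% (above normal) and 80% (below normal)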

def download_ndvi_dataset(root_directory_path):
    remote_url = "https://data.humdata.org/dataset/7f2ba5ba-8df1-41cf-ab18-fc1da928a1e5/resource/c06298d9-0d4d-4e40-aecc-abc1da75dc4d/download/col-ndvi-adm2-full.csv"
    local_file_ndvi_dataset = f"{root_directory_path}/ndvi_Colombia.csv"
    request.urlretrieve(remote_url, local_file_ndvi_dataset)

    df_ndvi = pd.read_csv(local_file_ndvi_dataset, low_memory=False)

    df_ndvi = df_ndvi.drop(df_ndvi.index[0])
    df_ndvi['date'] = pd.to_datetime(df_ndvi['date'])
    df_ndvi = df_ndvi[df_ndvi['date'] <= pd.to_datetime("2023-12-31")]
    os.remove(local_file_ndvi_dataset)

    return df_ndvi

def get_ndvi_postal_codes(df_ndvi, root_directory_path):
    postal_codes = list(set(df_ndvi["ADM2_PCODE"].values.astype('str')))
    postal_codes = np.array([postal_code.replace("CO", "") for postal_code in postal_codes]).astype(int)

    # postal codes
    remote_url = "https://www.datos.gov.co/api/views/ixig-z8b5/rows.csv?accessType=DOWNLOAD"
    postal_codes_path = f"{root_directory_path}/postal_codes.csv"
    request.urlretrieve(remote_url, postal_codes_path)

    column_name = "codigo_municipio"
    df_postal_codes = pd.read_csv(postal_codes_path)

    df_postal_codes[column_name] = df_postal_codes[column_name].replace(',', '').astype(int)
    df_postal_codes = df_postal_codes.drop_duplicates(subset=column_name, keep='first')

    result = df_postal_codes[df_postal_codes[column_name].isin(postal_codes)][['nombre_departamento', 'nombre_municipio', 'codigo_municipio', 'codigo_postal']]
    result.reset_index(drop=True, inplace=True)
    result.sort_values(by="codigo_postal")
    os.remove(postal_codes_path)

    return result

def get_ndvi_lat_lon(geolocator, location):
    try:
        lat_lon_result = geolocator.geocode(location, timeout=10)
        return (lat_lon_result.latitude, lat_lon_result.longitude) if lat_lon_result else (None, None)
    except:
        return (None, None)

def get_ndvi_by_values(geolocator, locations, municipality_fixed):
    lat_lon_ndvi = np.zeros((len(locations), 2))
    for index, [municipality, department] in enumerate(locations):
        municipality = municipality_fixed[municipality] if municipality in municipality_fixed else municipality
        location = get_ndvi_lat_lon(geolocator, f"{municipality}, {department}, COLOMBIA")
        lat_lon_ndvi[index] = np.array(location)

    return lat_lon_ndvi

def get_ndvi_lat_lon_values(result):
    values = result[['nombre_municipio', 'nombre_departamento']].values
    geolocator = Nominatim(user_agent="ndvi_data")
    municipality_fixed = {
        "VILLA DE SAN DIEGO DE UBATE": "UBATE",
        "CERRO SAN ANTONIO": "SAN ANTONIO",
        "SAN JUAN DE RIO SECO": "SAN JUAN DE RIOSECO",
        "TOLU VIEJO": "TOLUVIEJO",
        "SAN ANDRES DE TUMACO": "TUMACO",
        "EL CANTON DEL SAN PABLO": "EL CANTON DE SAN PABLO",
        "SAN LUIS DE SINCE": "SINCE",
        "SAN JOSE DE ALBAN": "ALBAN"
    }

    return get_ndvi_by_values(geolocator, values, municipality_fixed)

def union_ndvi_data(df_ndvi, result, lat_lon_ndvi):
    result['latitude'] = lat_lon_ndvi[:, 0]
    result['longitude'] = lat_lon_ndvi[:, 1]

    result.rename(columns={'codigo_municipio': 'ADM2_PCODE'}, inplace=True)
    result['ADM2_PCODE'] = 'CO' + result['ADM2_PCODE'].astype(str)
    merged_df_ndvi = pd.merge(df_ndvi, result[['latitude', 'longitude', 'ADM2_PCODE']], on='ADM2_PCODE', how='left')
    merged_df_ndvi = merged_df_ndvi.drop(columns={'adm2_id', 'ADM2_PCODE'})

    return merged_df_ndvi

def collect_ndvi_data(root_directory_path, merged_df_ndvi, df_forest_fire):
    values = {key: [] for key in merged_df_ndvi.columns}
    merged_df_ndvi = merged_df_ndvi.dropna()

    for year in range(2002, 2024):
        # Filter both datasets to the current year
        date_min, date_max = pd.to_datetime(f'{year}-01-01'), pd.to_datetime(f'{year}-12-31')
        df_ndvi_temp = merged_df_ndvi[(date_min <= merged_df_ndvi['date']) & (merged_df_ndvi['date'] <= date_max)]
        df_forest_fire_temp = df_forest_fire[(date_min <= df_forest_fire['date']) & (df_forest_fire['date'] <= date_max)]

        df_ndvi_temp.reset_index(drop=True, inplace=True)
        df_forest_fire_temp.reset_index(drop=True, inplace=True)
        init_date = pd.to_datetime(f'{year}-01-01')

        # Forest fire values
        lat_values = df_forest_fire_temp['latitude'].values
        lon_values = df_forest_fire_temp['longitude'].values
        time_values = (df_forest_fire_temp['date'] - init_date).dt.days.values

        # NDVI values
        lat = df_ndvi_temp['latitude'].values
        lon = df_ndvi_temp['longitude'].values
        time = (df_ndvi_temp['date'] - init_date).dt.days.values

        # Nearest NDVI record (in lat/lon/day space) for each fire detection
        points = np.vstack((lat, lon, time)).T
        tree = cKDTree(points)
        query_points = np.vstack((lat_values, lon_values, time_values)).T
        _, indexes = tree.query(query_points)

        for key in ['latitude', 'longitude', 'date']:
            values[key] += list(df_forest_fire_temp[key].values)

        for key in ['n_pixels', 'vim', 'vim_avg', 'viq']:
            values[key] = np.append(values[key], df_ndvi_temp.iloc[indexes][key].values).astype(float)

    df_ndvi = pd.DataFrame(values).sort_values(by="date").dropna()
    df_ndvi.to_pickle(f"{root_directory_path}/ndvi.pkl")

def save_df_ndvi(root_directory_path, df_forest_fire):
    df_ndvi = download_ndvi_dataset(root_directory_path)
    result = get_ndvi_postal_codes(df_ndvi, root_directory_path)

    lat_lon_ndvi = get_ndvi_lat_lon_values(result)
    merged_df_ndvi = union_ndvi_data(df_ndvi, result, lat_lon_ndvi)

    collect_ndvi_data(root_directory_path, merged_df_ndvi, df_forest_fire)

Global Climate Data


TerraClimate is a dataset of monthly climate and climatic water balance for global terrestrial
surfaces from 1958-2019. These data provide important inputs for ecological and hydrological
studies at global scales that require high spatial resolution and time-varying data. All data have
monthly temporal resolution and a ~4-km (1/24th degree) spatial resolution. The data cover the
period from 1958-2020. We plan to update these data periodically (annually).

More information here
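Before the loading code, the sketch below (an illustrative snippet, not part of the original pipeline) opens a single TerraClimate file over OPeNDAP from the same THREDDS server used later and inspects its grid and time axis. It assumes network access to that server and that the 2020 tmax file exists at the URL pattern used in download_global_climate_dataset.

# Hedged sketch: inspect one TerraClimate variable/year over OPeNDAP.
from datetime import date, timedelta
from netCDF4 import Dataset

url = "http://thredds.northwestknowledge.net:8080/thredds/dodsC/TERRACLIMATE_ALL/data/TerraClimate_tmax_2020.nc"
fh = Dataset(url, 'r', format="NETCDF4")

lat, lon, time = fh.variables['lat'][:], fh.variables['lon'][:], fh.variables['time'][:]
print(lat.shape, lon.shape, time.shape)   # global ~4 km grid, 12 monthly steps
print(abs(lat[1] - lat[0]))               # grid spacing of 1/24 degree
# The loader below treats the time axis as days since 1900-01-01:
print([date(1900, 1, 1) + timedelta(days=float(t)) for t in time[:3]])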

def check_latlon_bounds(lat, lon, lat_index, lon_index, lat_target, lon_target):
    # check final indices are in right bounds (without stepping past either end of the axis)
    if lat[lat_index] > lat_target:
        if lat_index != 0:
            lat_index = lat_index - 1
    if lat[lat_index] < lat_target:
        if lat_index != len(lat) - 1:
            lat_index = lat_index + 1
    if lon[lon_index] > lon_target:
        if lon_index != 0:
            lon_index = lon_index - 1
    if lon[lon_index] < lon_target:
        if lon_index != len(lon) - 1:
            lon_index = lon_index + 1

    return [lat_index, lon_index]

def get_indexes(data, points):
    data_reshaped = data.filled().reshape(-1, 1)
    tree = cKDTree(data_reshaped)
    query_points = points.to_numpy().reshape(-1, 1)
    _, indexes = tree.query(query_points)

    return indexes

def get_data_by_date(varname, filehandle, time_values, lat_values, lon_values, year, lat_min, lon_min, lat_max, lon_max):
    # subset in space (lat/lon)
    lathandle = filehandle.variables['lat']
    lonhandle = filehandle.variables['lon']
    lat = lathandle[:]
    lon = lonhandle[:]

    # find indices of target lat/lon/day
    lat_index_min = (np.abs(lat - lat_min)).argmin()
    lat_index_max = (np.abs(lat - lat_max)).argmin()
    lon_index_min = (np.abs(lon - lon_min)).argmin()
    lon_index_max = (np.abs(lon - lon_max)).argmin()

    [lat_index_min, lon_index_min] = check_latlon_bounds(lat, lon, lat_index_min, lon_index_min, lat_min, lon_min)
    [lat_index_max, lon_index_max] = check_latlon_bounds(lat, lon, lat_index_max, lon_index_max, lat_max, lon_max)

    if lat_index_min > lat_index_max:
        lat_index_range = range(lat_index_max, lat_index_min + 1)
    else:
        lat_index_range = range(lat_index_min, lat_index_max + 1)
    if lon_index_min > lon_index_max:
        lon_index_range = range(lon_index_max, lon_index_min + 1)
    else:
        lon_index_range = range(lon_index_min, lon_index_max + 1)

    lat = lat[lat_index_range]
    lon = lon[lon_index_range]

    # subset in time (days since 1900-01-01)
    timehandle = filehandle.variables['time']
    time = timehandle[:]
    time_min = (date(year, 1, 1) - date(1900, 1, 1)).days
    time_max = (date(year, 12, 31) - date(1900, 1, 1)).days
    time_index_min = (np.abs(time - time_min)).argmin()
    time_index_max = (np.abs(time - time_max)).argmin()
    time_index_range = range(time_index_min, time_index_max + 1)
    time = timehandle[time_index_range]

    # subset data
    datahandle = filehandle.variables[varname]
    data = datahandle[time_index_range, lat_index_range, lon_index_range]

    # nearest grid time/lat/lon index for each fire detection
    time_indexes = get_indexes(time, time_values)
    lat_indexes = get_indexes(lat, lat_values)
    lon_indexes = get_indexes(lon, lon_values)

    return list(data[time_indexes, lat_indexes, lon_indexes].filled(np.nan))

def get_data_country(df_modis, varnames, datasets):
    values = {varname: [] for varname in ["date", "latitude", "longitude"] + varnames}
    df_modis["date"] = pd.to_datetime(df_modis["date"])
    df_modis = df_modis.sort_values(by="date")

    for year in range(2002, 2024):
        df = df_modis[df_modis["date"] <= pd.to_datetime(f"{year}-12-31")]
        df = df[pd.to_datetime(f"{year}-01-01") <= df["date"]]

        date_values, lat_values, lon_values = df['date'], df['latitude'], df['longitude']
        lat_min, lon_min = lat_values.min(), lon_values.min()
        lat_max, lon_max = lat_values.max(), lon_values.max()

        values['date'] += [str(date_.date()) for date_ in date_values]
        values['latitude'] += list(lat_values.values)
        values['longitude'] += list(lon_values.values)
        time_values = (date_values - pd.to_datetime("1900-01-01")).dt.days

        for varname in varnames:
            filehandle = datasets[f"{year}-{varname}"]
            values[varname] += get_data_by_date(varname, filehandle, time_values, lat_values, lon_values,
                                                year, lat_min, lon_min, lat_max, lon_max)

    return values

def download_global_climate_dataset(varnames):
    datasets = {}
    for year in range(2002, 2024):
        for varname in varnames:
            pathname = f"http://thredds.northwestknowledge.net:8080/thredds/dodsC/TERRACLIMATE_ALL/data/TerraClimate_{varname}_{year}.nc"
            filehandle = Dataset(pathname, 'r', format="NETCDF4")
            datasets[f"{year}-{varname}"] = filehandle

    return datasets

def save_df_global_climate(root_directory_path, df_forest_fire):
    varnames = ["ws", "vpd", "vap", "tmin", "tmax", "swe", "srad",
                "soil", "q", "ppt", "pet", "def", "aet", "PDSI"]
    datasets = download_global_climate_dataset(varnames)
    values = get_data_country(df_forest_fire, varnames, datasets)
    df_global_climate = pd.DataFrame(values)
    for varname in varnames:
        df_global_climate[varname] = df_global_climate[varname].astype(float, copy=True)

    # Convert the temperatures to Kelvin
    kelvin = 273.15
    df_global_climate["tmin"] = df_global_climate["tmin"] + kelvin
    df_global_climate["tmax"] = df_global_climate["tmax"] + kelvin

    # Save the dataset
    df_global_climate['date'] = pd.to_datetime(df_global_climate['date'])
    df_global_climate = df_global_climate.sort_values(by="date").dropna()

    df_global_climate.to_pickle(f"{root_directory_path}/global_climate.pkl")

Land Cover Data

The Intergovernmental Panel on Climate Change (IPCC) provides guidance on reporting areal
extent and change of land cover and land use, requiring the use of estimators that neither over-
nor underestimate dynamics to the degree possible, and that have known uncertainties. The
maps provided by GLAD do not have these properties. However, the maps can be leveraged to
facilitate appropriate probability-based statistical methods in deriving statistically valid areas of
forest extent and change. Specifically, the maps may be used as a stratifier in targeting forest
extent and/or change by a probability sample. The team at GLAD has demonstrated such
approaches using the GLAD forest loss data in sample-based area estimation (Tyukavina et al.,
ERL, 2018, Turubanova et al., ERL, 2019, and Potapov et al., RSE, 2019, among others).

More information here

Legend here
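The GLCLU maps are served as 10-degree tiles, and the code below pulls the five tiles that cover Colombia. As a quick orientation (an illustrative snippet, not part of the pipeline), the sketch below maps a coordinate to the tile name implied by the limits used in split_lat_lon_values further down, where each tile is named after its upper-left corner.

# Hedged sketch: which 10x10 degree GLCLU tile covers a given coordinate.
def tile_for(lat, lon):
    limits = [
        ((10, 20), (-80, -70), "20N_080W"),
        ((0, 10), (-80, -70), "10N_080W"),
        ((0, 10), (-70, -60), "10N_070W"),
        ((-10, 0), (-80, -70), "00N_080W"),
        ((-10, 0), (-70, -60), "00N_070W"),
    ]
    for (lat_min, lat_max), (lon_min, lon_max), name in limits:
        if lat_min < lat <= lat_max and lon_min <= lon < lon_max:
            return name
    return None

print(tile_for(4.71, -74.07))   # Bogota -> '10N_080W'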

def download_land_cover_dataset(root_directory_path):
    range_values = [(20, 80), (10, 80), (10, 70), ('00', 80), ('00', 70)]
    for year in range(2000, 2021, 5):
        for N, W in range_values:
            remote_url = f"https://storage.googleapis.com/earthenginepartners-hansen/GLCLU2000-2020/v2/{year}/{N}N_0{W}W.tif"
            land_cover_path = f"{root_directory_path}/land_cover_Colombia_{year}_{N}N_0{W}W.tif"
            request.urlretrieve(remote_url, land_cover_path)

    return range_values

def get_land_cover(lat_lon_array, land_cover_path):
    with rasterio.open(land_cover_path) as src:
        transform = src.transform
        tif_crs = src.crs
        transformer = Transformer.from_crs("epsg:4326", tif_crs, always_xy=True)
        lon_values, lat_values = lat_lon_array[:, 1], lat_lon_array[:, 0]
        x_coords, y_coords = transformer.transform(lon_values, lat_values)
        row, col = rasterio.transform.rowcol(transform, x_coords, y_coords)
        values = src.read(1)[row, col]

    return lat_values, lon_values, values

def save_land_cover_values(values, name, root_directory_path):
    land_cover_values = {'lat': [], 'lon': [], 'year': [], 'land_cover': []}
    lat_lon_array = np.array(values)
    for year in range(2000, 2021, 5):
        land_cover_path = f"{root_directory_path}/land_cover_Colombia_{year}_{name}.tif"
        lat_values, lon_values, result = get_land_cover(lat_lon_array, land_cover_path)

        land_cover_values['lat'] += list(lat_values)
        land_cover_values['lon'] += list(lon_values)
        land_cover_values['year'] += list(np.full(len(lat_values), year))
        land_cover_values['land_cover'] += list(result)

        os.remove(f"{root_directory_path}/land_cover_Colombia_{year}_{name}.tif")

    df = pd.DataFrame(land_cover_values)
    df.to_csv(f"{root_directory_path}/land_cover_Colombia_{name}.csv", index=False)

def split_lat_lon_values(lat_lon_values, range_values, root_directory_path):
    limits = [
        ((10, 20), (-80, -70)),
        ((0, 10), (-80, -70)),
        ((0, 10), (-70, -60)),
        ((-10, 0), (-80, -70)),
        ((-10, 0), (-70, -60))
    ]
    for limit_values, (N, W) in zip(limits, range_values):
        ((lat_min, lat_max), (lon_min, lon_max)), name = limit_values, f"{N}N_0{W}W"
        lat_lon_values_filtered = list(filter(
            lambda lat_lon: lat_min < lat_lon[0] <= lat_max and lon_min <= lat_lon[1] < lon_max,
            lat_lon_values))
        save_land_cover_values(lat_lon_values_filtered, name, root_directory_path)

def union_land_cover_data(range_values, root_directory_path):
    df_land_cover = pd.DataFrame()
    for N, W in range_values:
        land_cover_csv_path = f"{root_directory_path}/land_cover_Colombia_{N}N_0{W}W.csv"
        df_tlc = pd.read_csv(land_cover_csv_path)
        df_land_cover = pd.concat([df_land_cover, df_tlc])
        os.remove(land_cover_csv_path)

    return df_land_cover.sort_values(by="year").dropna()

def get_model(df_land_cover):
    X = df_land_cover[['lat', 'lon', 'year']].values
    y = df_land_cover['land_cover'].values

    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    model = DecisionTreeRegressor(max_depth=30, random_state=42)
    model.fit(X_train, y_train)

    return model

def save_df_land_cover_predicted(root_directory_path, model, interpolated_values, extrapolated_values):
    land_covers_interpolated = model.predict(interpolated_values)
    land_covers_extrapolated = model.predict(extrapolated_values)

    df_land_cover_predicted = pd.DataFrame({
        'latitude': np.append(interpolated_values[:, 0], extrapolated_values[:, 0]),
        'longitude': np.append(interpolated_values[:, 1], extrapolated_values[:, 1]),
        'year': np.append(interpolated_values[:, 2], extrapolated_values[:, 2]).astype(int),
        'land_cover': np.append(land_covers_interpolated, land_covers_extrapolated).astype(int)
    }).sort_values(by="year")

    df_land_cover_predicted.to_pickle(f"{root_directory_path}/land_cover.pkl")

def download_land_cover_legend(root_directory_path):
    remote_url = "https://storage.googleapis.com/earthenginepartners-hansen/GLCLU2000-2020/legend.xlsx"
    land_cover_legend_path = f"{root_directory_path}/land_cover_legend_Colombia.xlsx"
    request.urlretrieve(remote_url, land_cover_legend_path)

    return land_cover_legend_path

def set_values(df_land_cover_legend, column1, column2, indexes, nan_indexes=[]):
    for start_index, end_index in indexes:
        total = end_index - start_index + 1
        df_land_cover_legend.loc[np.linspace(start_index, end_index, total), column1] = df_land_cover_legend.at[start_index, column2]

    for nan_index in nan_indexes:
        df_land_cover_legend.at[nan_index, column1] = np.nan

def save_df_land_cover_legend(root_directory_path, land_cover_legend_path):
    df_land_cover_legend = pd.read_excel(land_cover_legend_path)
    df_land_cover_legend = df_land_cover_legend.drop(columns={"Color code"}).rename(columns={'Unnamed: 2': 'class'})

    # Same column
    set_values(df_land_cover_legend, 'General class', 'General class',
               [(0, 96), (100, 196), (200, 207)], [97, 197, 208, 242, 245, 251, 255])
    set_values(df_land_cover_legend, 'class', 'class',
               [(0, 1), (2, 18), (19, 24), (25, 48), (100, 101), (102, 118), (119, 124), (125, 148)], [49, 149])

    # Other column
    set_values(df_land_cover_legend, 'class', 'General class',
               [(200, 207), (241, 241), (244, 244), (250, 250), (254, 254)])
    set_values(df_land_cover_legend, 'Sub-class', 'General class',
               [(241, 241), (244, 244), (250, 250), (254, 254)])

    # Replacing nan values
    df_land_cover_legend = df_land_cover_legend.fillna("Not registered")

    df_land_cover_legend.to_pickle(f"{root_directory_path}/land_cover_legend.pkl")
    os.remove(land_cover_legend_path)

def save_df_land_cover(root_directory_path, df_forest_fire):
    interpolated_values, extrapolated_values = get_inter_extrapolated_values(df_forest_fire)
    lat_lon_values = get_lat_lon_values(df_forest_fire)

    range_values = download_land_cover_dataset(root_directory_path)
    split_lat_lon_values(lat_lon_values, range_values, root_directory_path)

    df_land_cover = union_land_cover_data(range_values, root_directory_path)
    land_cover_legend_path = download_land_cover_legend(root_directory_path)
    model = get_model(df_land_cover)

    save_df_land_cover_predicted(root_directory_path, model, interpolated_values, extrapolated_values)
    save_df_land_cover_legend(root_directory_path, land_cover_legend_path)

Population Density Data


Estimated population density per grid-cell. The dataset is available to download in GeoTIFF and
ASCII XYZ format at a resolution of 30 arc-seconds (approximately 1 km at the equator). The
projection is the Geographic Coordinate System, WGS84. The units are number of people per
square kilometre based on country totals adjusted to match the corresponding official United
Nations population estimates prepared by the Population Division of the Department of Economic
and Social Affairs of the United Nations Secretariat (2019 Revision of World Population
Prospects). The mapping approach is Random Forest-based dasymetric redistribution.

More information here
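As a quick sanity check of the stated resolution (an illustrative calculation assuming a spherical Earth with mean radius of roughly 6371 km), 30 arc-seconds of latitude works out to about 1 km:

# Hedged sketch: convert 30 arc-seconds to kilometres at the equator.
import math

earth_circumference_km = 2 * math.pi * 6371      # ~40,030 km
km_per_degree = earth_circumference_km / 360     # ~111.2 km
km_per_30_arcsec = km_per_degree * (30 / 3600)   # ~0.93 km
print(round(km_per_30_arcsec, 3))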


def download_population_density_dataset(root_directory_path):
    for year in range(2002, 2021):
        remote_url = f"https://data.worldpop.org/GIS/Population_Density/Global_2000_2020_1km/{year}/COL/col_pd_{year}_1km_ASCII_XYZ.zip"
        local_file = f"{root_directory_path}/population_density_Colombia_{year}.zip"
        request.urlretrieve(remote_url, local_file)

        with zipfile.ZipFile(local_file, 'r') as zip_ref:
            zip_ref.extractall(root_directory_path)
        os.remove(local_file)

def save_data(root_directory_path, df_forest_fire):
    for year in range(2002, 2021):
        pd_path = f"{root_directory_path}/col_pd_{year}_1km_ASCII_XYZ.csv"
        if os.path.exists(pd_path):
            df_pd = pd.read_csv(pd_path)
            df_pd.rename(columns={'X': 'longitude', 'Y': 'latitude', 'Z': 'population_density'}, inplace=True)

            # Filter fire detections by date
            date_min, date_max = pd.to_datetime(f"{year}"), pd.to_datetime(f"{year + 1}")
            df_ff = df_forest_fire[(date_min <= df_forest_fire['date']) & (df_forest_fire['date'] < date_max)]
            lat_values, lon_values = df_ff['latitude'], df_ff['longitude']

            # Bounding box of this year's detections
            lat_min, lat_max = lat_values.min(), lat_values.max()
            lon_min, lon_max = lon_values.min(), lon_values.max()

            # Filter latitudes
            df_pd.sort_values(by="latitude")
            df_pd = df_pd[lat_min <= df_pd['latitude']]
            df_pd = df_pd[df_pd['latitude'] <= lat_max]

            # Filter longitudes
            df_pd.sort_values(by="longitude")
            df_pd = df_pd[lon_min <= df_pd['longitude']]
            df_pd = df_pd[df_pd['longitude'] <= lon_max]

            # Set up values
            df_pd.reset_index(drop=True, inplace=True)
            lat, lon = df_pd['latitude'].to_numpy(), df_pd['longitude'].to_numpy()

            # Nearest population-density cell for each detection
            points = np.vstack((lat, lon)).T
            tree = cKDTree(points)
            query_points = np.vstack((lat_values, lon_values)).T
            _, indices = tree.query(query_points)
            population_density_values = df_pd.iloc[indices]['population_density'].to_numpy()

            # Save the data
            df_population_density = pd.DataFrame({'latitude': lat_values, 'longitude': lon_values,
                                                  'year': np.full(len(lat_values), year),
                                                  'population_density': population_density_values})

            df_population_density.to_csv(f"{root_directory_path}/population_density_Colombia_{year}.csv", index=False)
            os.remove(pd_path)

def union_pd_data(root_directory_path):
    df_population_density = pd.DataFrame()
    for year in range(2002, 2021):
        df_ppd = pd.read_csv(f"{root_directory_path}/population_density_Colombia_{year}.csv")
        df_population_density = pd.concat([df_population_density, df_ppd])

        os.remove(f"{root_directory_path}/population_density_Colombia_{year}.csv")

    return df_population_density.sort_values(by="year").dropna()

def get_regressor(df_population_density):
    X = df_population_density[['latitude', 'longitude', 'year']].values  # Features: latitude, longitude and year
    y = df_population_density['population_density'].values               # Population density as the target variable

    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3, random_state=42)

    regressor = DecisionTreeRegressor(max_depth=20, random_state=42)
    regressor.fit(X_train, y_train)

    return regressor

def save_df_population_density_predicted(root_directory_path, df_population_density, regressor, extrapolated_values):
    densities_predicted = regressor.predict(extrapolated_values)
    df_pd_predicted = pd.DataFrame({
        'latitude': extrapolated_values[:, 0],
        'longitude': extrapolated_values[:, 1],
        'year': extrapolated_values[:, 2].astype(int),
        'population_density': densities_predicted
    })

    df_pd_predicted = pd.concat([df_population_density, df_pd_predicted]).sort_values(by="year")

    df_pd_predicted.to_pickle(f"{root_directory_path}/population_density.pkl")

def save_df_population_density(root_directory_path, df_forest_fire):
    download_population_density_dataset(root_directory_path)
    save_data(root_directory_path, df_forest_fire)

    _, extrapolated_values = get_inter_extrapolated_values(df_forest_fire)
    df_population_density = union_pd_data(root_directory_path)
    regressor = get_regressor(df_population_density)

    save_df_population_density_predicted(root_directory_path, df_population_density, regressor, extrapolated_values)

Union Data
def read_values(root_directory_path):
    df_ndvi = pd.read_pickle(f"{root_directory_path}/ndvi.pkl")
    df_land_cover = pd.read_pickle(f"{root_directory_path}/land_cover.pkl")

    df_forest_fire = pd.read_pickle(f"{root_directory_path}/forest_fire.pkl")
    df_global_climate = pd.read_pickle(f"{root_directory_path}/global_climate.pkl")

    df_land_cover_legend = pd.read_pickle(f"{root_directory_path}/land_cover_legend.pkl")
    df_population_density = pd.read_pickle(f"{root_directory_path}/population_density.pkl")

    return df_ndvi, df_land_cover, df_forest_fire, df_global_climate, df_land_cover_legend, df_population_density

def union_data(root_directory_path):
    df_ndvi, df_land_cover, df_forest_fire, df_global_climate, df_land_cover_legend, df_population_density = read_values(root_directory_path)
    df_forest_fire['date'] = pd.to_datetime(df_forest_fire['date'])
    df_forest_fire['year'] = df_forest_fire['date'].astype(str).str.slice(start=0, stop=4).astype(int)

    df_final = pd.merge(df_forest_fire, df_ndvi, on=['latitude', 'longitude', 'date'], how="left")
    df_final_land_cover = pd.merge(df_land_cover.rename(columns={'land_cover': 'Map value'}),
                                   df_land_cover_legend, on=['Map value'], how="left").drop(columns=['Map value'])
    df_final = pd.merge(df_final, df_final_land_cover, on=['latitude', 'longitude', 'year'], how="left")

    df_final = pd.merge(df_final, df_global_climate, on=['latitude', 'longitude', 'date'], how="left")
    df_final = pd.merge(df_final, df_population_density, on=['latitude', 'longitude', 'year'], how="left")

    return df_final

Download data
def download_data(root_directory_path):
    df_forest_fire = get_df_forest_fire(root_directory_path)

    save_df_ndvi(root_directory_path, df_forest_fire.copy())
    save_df_global_climate(root_directory_path, df_forest_fire.copy())

    save_df_land_cover(root_directory_path, df_forest_fire.copy())
    save_df_population_density(root_directory_path, df_forest_fire.copy())

Save data
root_directory_path = get_root_directory()

download_data(root_directory_path)

df_final = union_data(root_directory_path)
df_final.head(5)

{"type":"dataframe","variable_name":"df_final"}

df_final.to_pickle(f"{root_directory_path}/final_dataset.pkl")
