Proyecto Final Load Data
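The log below comes from installing the notebook's dependencies. The triggering cell is not preserved in this export, but it was presumably along these lines (psycopg2 was already present in the runtime):

!pip install rasterio netCDF4 python-dotenv psycopg2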
Collecting rasterio
  Downloading rasterio-1.3.10-cp310-cp310-manylinux2014_x86_64.whl (21.5 MB)
Collecting netCDF4
  Downloading netCDF4-1.6.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.5 MB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl
Requirement already satisfied: psycopg2 in /usr/local/lib/python3.10/dist-packages (2.9.9)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl (15 kB)
Requirement already satisfied: attrs in /usr/local/lib/python3.10/dist-packages (from rasterio) (23.2.0)
Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from rasterio) (2024.2.2)
Requirement already satisfied: click>=4.0 in /usr/local/lib/python3.10/dist-packages (from rasterio) (8.1.7)
Requirement already satisfied: cligj>=0.5 in /usr/local/lib/python3.10/dist-packages (from rasterio) (0.7.2)
Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from rasterio) (1.25.2)
Collecting snuggs>=1.4.1 (from rasterio)
  Downloading snuggs-1.4.7-py3-none-any.whl (5.4 kB)
Requirement already satisfied: click-plugins in /usr/local/lib/python3.10/dist-packages (from rasterio) (1.1.1)
Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from rasterio) (67.7.2)
Collecting cftime (from netCDF4)
  Downloading cftime-1.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Requirement already satisfied: pyparsing>=2.1.6 in /usr/local/lib/python3.10/dist-packages (from snuggs>=1.4.1->rasterio) (3.1.2)
Installing collected packages: snuggs, python-dotenv, cftime, affine, rasterio, netCDF4
Successfully installed affine-2.4.0 cftime-1.6.3 netCDF4-1.6.5 python-dotenv-1.0.1 rasterio-1.3.10 snuggs-1.4.7
Utils
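The utilities below depend on a handful of imports that this export does not show. The following set is reconstructed from the calls the notebook makes and should cover them:

import os
from datetime import date
from urllib import request

import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim
from netCDF4 import Dataset
import rasterio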
def get_root_directory():
    root_directory_path = "./datasets"
    if not os.path.exists(root_directory_path):
        os.makedirs(root_directory_path)
    return root_directory_path
def get_lat_lon_values(df_forest_fire):
    lat_values = df_forest_fire['latitude'].to_numpy()
    lon_values = df_forest_fire['longitude'].to_numpy()
    # Deduplicate coordinate pairs
    lat_lon_values = list(set(zip(lat_values, lon_values)))
    return lat_lon_values
def get_inter_extrapolated_values(df_forest_fire):
    df_forest_fire['year'] = df_forest_fire['date'].astype(str).str.slice(start=0, stop=4).astype(int)
    interpolated_values = df_forest_fire[df_forest_fire['year'] <= 2020][['latitude', 'longitude', 'year']].values
    extrapolated_values = df_forest_fire[2021 <= df_forest_fire['year']][['latitude', 'longitude', 'year']].values
    df_forest_fire = df_forest_fire.drop(columns=['year'])
    return interpolated_values, extrapolated_values

# Tail of get_forest_fire_archive_dataset (the rest of the function is not
# preserved in this export): remove the downloaded FIRMS archive files and
# return the frame.
os.remove(f"{root_directory_path}/fire_nrt_{code}.csv")
os.remove(f"{root_directory_path}/Readme.txt")
os.remove(remote_name)
os.remove(local_file)
return df_forest_fire
def download_forest_fire_dataset(root_directory_path):
    df_viirs = get_forest_fire_archive_dataset(root_directory_path, "SV-C2_457866")
    df_modis = get_forest_fire_archive_dataset(root_directory_path, "M-C61_457865")
    df_forest_fire = pd.concat([df_viirs, df_modis])
    # Map the numeric FIRMS `type` codes to readable labels via a left join
    return pd.merge(
        df_forest_fire.sort_values(by="acq_date")
        .rename(columns={"type": "fire_type", "acq_date": "date"})
        .dropna(),
        pd.DataFrame({
            "fire_type": [0, 1, 2, 3],
            "type": ["presumed vegetation fire", "active volcano",
                     "other static land source", "offshore"]
        }), on="fire_type", how="left").drop(columns=["fire_type"])
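The left join above is the standard lookup-table pattern: every detection is kept, and its numeric code picks up a label. A self-contained sketch of the same idea on toy data (not the real FIRMS archive):

import pandas as pd

# Toy detections with numeric fire-type codes (illustrative only)
detections = pd.DataFrame({"latitude": [4.6, 10.4], "fire_type": [0, 2]})
labels = pd.DataFrame({
    "fire_type": [0, 1, 2, 3],
    "type": ["presumed vegetation fire", "active volcano",
             "other static land source", "offshore"],
})
# Left join keeps every detection and attaches its label
print(pd.merge(detections, labels, on="fire_type", how="left"))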
def get_df_forest_fire(root_directory_path):
    df_forest_fire = download_forest_fire_dataset(root_directory_path)
    df_forest_fire.to_pickle(f"{root_directory_path}/forest_fire.pkl")
    df_forest_fire['date'] = pd.to_datetime(df_forest_fire['date'])
    return df_forest_fire
NDVI Data
This dataset contains dekadal NDVI indicators computed from NASA's Moderate Resolution Imaging Spectroradiometer (MODIS) collection 6.1 on the Aqua and Terra satellites, aggregated by sub-national administrative units. The administrative units used for aggregation are based on WFP data, and each unit carries a Pcode reference. The number of input pixels used to create each aggregate is provided in the n_pixels column.
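A quick way to inspect the aggregation described above is to read the HDX file directly (the URL is the one used in the download function below; the read may take a moment):

import pandas as pd

url = ("https://data.humdata.org/dataset/7f2ba5ba-8df1-41cf-ab18-fc1da928a1e5/"
       "resource/c06298d9-0d4d-4e40-aecc-abc1da75dc4d/download/col-ndvi-adm2-full.csv")
df = pd.read_csv(url)
print(df.head())                  # one row per admin unit and dekad
print(df['n_pixels'].describe())  # pixel counts behind each aggregate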
def download_ndvi_dataset(root_directory_path):
    remote_url = ("https://data.humdata.org/dataset/7f2ba5ba-8df1-41cf-ab18-fc1da928a1e5/"
                  "resource/c06298d9-0d4d-4e40-aecc-abc1da75dc4d/download/col-ndvi-adm2-full.csv")
    local_file_ndvi_dataset = f"{root_directory_path}/ndvi_Colombia.csv"
    request.urlretrieve(remote_url, local_file_ndvi_dataset)
    df_ndvi = pd.read_csv(local_file_ndvi_dataset)  # load the downloaded CSV
    return df_ndvi
def get_postal_codes(root_directory_path, postal_codes):  # name and signature reconstructed; the original def line is missing from this export
    # Postal codes
    remote_url = "https://www.datos.gov.co/api/views/ixig-z8b5/rows.csv?accessType=DOWNLOAD"
    postal_codes_path = f"{root_directory_path}/postal_codes.csv"
    request.urlretrieve(remote_url, postal_codes_path)
    column_name = "codigo_municipio"
    df_postal_codes = pd.read_csv(postal_codes_path)
    # Strip thousands separators before casting to int
    df_postal_codes[column_name] = (df_postal_codes[column_name].astype(str)
                                    .str.replace(',', '', regex=False).astype(int))
    df_postal_codes = df_postal_codes.drop_duplicates(subset=column_name, keep='first')
    result = df_postal_codes[df_postal_codes[column_name].isin(postal_codes)][
        ['nombre_departamento', 'nombre_municipio', 'codigo_municipio', 'codigo_postal']]
    result = result.sort_values(by="codigo_postal")
    result.reset_index(drop=True, inplace=True)
    os.remove(postal_codes_path)
    return result
def get_ndvi_lat_lon_values(result):
    values = result[['nombre_municipio', 'nombre_departamento']].values
    geolocator = Nominatim(user_agent="ndvi_data")
    # Municipality names that Nominatim only resolves under a different spelling
    municipality_fixed = {
        "VILLA DE SAN DIEGO DE UBATE": "UBATE",
        "CERRO SAN ANTONIO": "SAN ANTONIO",
        "SAN JUAN DE RIO SECO": "SAN JUAN DE RIOSECO",
        "TOLU VIEJO": "TOLUVIEJO",
        "SAN ANDRES DE TUMACO": "TUMACO",
        "EL CANTON DEL SAN PABLO": "EL CANTON DE SAN PABLO",
        "SAN LUIS DE SINCE": "SINCE",
        "SAN JOSE DE ALBAN": "ALBAN"
    }
    # ... geocoding loop not preserved in this export ...
    return lat_lon_ndvi

def union_ndvi_data(df_ndvi, result, lat_lon_ndvi):  # def line reconstructed from the call site below
    result.rename(columns={'codigo_municipio': 'ADM2_PCODE'}, inplace=True)
    result['ADM2_PCODE'] = 'CO' + result['ADM2_PCODE'].astype(str)
    merged_df_ndvi = pd.merge(df_ndvi, result[['latitude', 'longitude', 'ADM2_PCODE']],
                              on='ADM2_PCODE', how='left')
    merged_df_ndvi = merged_df_ndvi.drop(columns=['adm2_id', 'ADM2_PCODE'])
    return merged_df_ndvi
# Fragments apparently from save_df_ndvi (the enclosing code is not fully
# preserved): align the NDVI and fire frames and build interpolation inputs.
df_ndvi_temp.reset_index(drop=True, inplace=True)
df_forest_fire_temp.reset_index(drop=True, inplace=True)
init_date = pd.to_datetime(f'{year}-01-01')
# NDVI values
lat = df_ndvi_temp['latitude'].values
lon = df_ndvi_temp['longitude'].values
time = (df_ndvi_temp['date'] - init_date).dt.days.values

df_ndvi = pd.DataFrame(values).sort_values(by="date").dropna()
df_ndvi.to_pickle(f"{root_directory_path}/ndvi.pkl")
lat_lon_ndvi = get_ndvi_lat_lon_values(result)
merged_df_ndvi = union_ndvi_data(df_ndvi, result, lat_lon_ndvi)
collect_ndvi_data(root_directory_path, merged_df_ndvi, df_forest_fire)
def check_latlon_bounds(lat, lon, lat_index, lon_index, lat_target, lon_target):
    # Check that the nearest-neighbour indices stay in bounds around the
    # target coordinates, nudging them one step when they overshoot
    if lat[lat_index] > lat_target and lat_index != 0:
        lat_index = lat_index - 1
    if lat[lat_index] < lat_target and lat_index != len(lat) - 1:
        lat_index = lat_index + 1
    if lon[lon_index] > lon_target and lon_index != 0:
        lon_index = lon_index - 1
    if lon[lon_index] < lon_target and lon_index != len(lon) - 1:
        lon_index = lon_index + 1
    return lat_index, lon_index
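check_latlon_bounds is meant to follow a nearest-index lookup. A small usage sketch on a synthetic grid (the coordinate axes and target point are illustrative, not from the real data):

import numpy as np

lat = np.arange(-5.0, 15.0, 0.5)   # synthetic coordinate axes
lon = np.arange(-80.0, -65.0, 0.5)
lat_target, lon_target = 4.61, -74.08

# Nearest grid index to each target, then nudge with the helper above
lat_index = np.abs(lat - lat_target).argmin()
lon_index = np.abs(lon - lon_target).argmin()
lat_index, lon_index = check_latlon_bounds(lat, lon, lat_index, lon_index,
                                           lat_target, lon_target)
print(lat[lat_index], lon[lon_index])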
# Fragment of the TerraClimate subsetting routine (the enclosing function
# is not fully preserved in this export).
if lat_index_min > lat_index_max:
    lat_index_range = range(lat_index_max, lat_index_min + 1)
else:
    lat_index_range = range(lat_index_min, lat_index_max + 1)
if lon_index_min > lon_index_max:
    lon_index_range = range(lon_index_max, lon_index_min + 1)
else:
    lon_index_range = range(lon_index_min, lon_index_max + 1)
lat = lat[lat_index_range]
lon = lon[lon_index_range]
# Subset in time (TerraClimate stores time as days since 1900-01-01)
timehandle = filehandle.variables['time']
time = timehandle[:]
time_min = (date(year, 1, 1) - date(1900, 1, 1)).days
time_max = (date(year, 12, 31) - date(1900, 1, 1)).days
time_index_min = (np.abs(time - time_min)).argmin()
time_index_max = (np.abs(time - time_max)).argmin()
time_index_range = range(time_index_min, time_index_max + 1)
time = timehandle[time_index_range]
# Subset data
datahandle = filehandle.variables[varname]
data = datahandle[time_index_range, lat_index_range, lon_index_range]
# Indexes
time_indexes = get_indexes(time, time_values)
lat_indexes = get_indexes(lat, lat_values)
lon_indexes = get_indexes(lon, lon_values)
def download_global_climate_dataset(varnames):
    datasets = {}
    for year in range(2002, 2024):
        for varname in varnames:
            pathname = (f"http://thredds.northwestknowledge.net:8080/thredds/dodsC/"
                        f"TERRACLIMATE_ALL/data/TerraClimate_{varname}_{year}.nc")
            # Open remotely over OPeNDAP; no file is downloaded
            filehandle = Dataset(pathname, 'r', format="NETCDF4")
            datasets[f"{year}-{varname}"] = filehandle
    return datasets
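Putting the fragments above together, a minimal end-to-end sketch of reading one TerraClimate variable over OPeNDAP and pulling a single grid cell (the variable name and coordinates are illustrative choices, not the notebook's):

from datetime import date
import numpy as np
from netCDF4 import Dataset

year, varname = 2010, "tmax"  # illustrative choices
url = (f"http://thredds.northwestknowledge.net:8080/thredds/dodsC/"
       f"TERRACLIMATE_ALL/data/TerraClimate_{varname}_{year}.nc")
fh = Dataset(url, 'r', format="NETCDF4")
lat = fh.variables['lat'][:]
lon = fh.variables['lon'][:]
# Nearest grid cell to Bogotá
i = np.abs(lat - 4.61).argmin()
j = np.abs(lon - (-74.08)).argmin()
# All twelve monthly values for that cell and year
print(fh.variables[varname][:, i, j])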
# Save the dataset
df_global_climate['date'] = pd.to_datetime(df_global_climate['date'])
df_global_climate = df_global_climate.sort_values(by="date").dropna()
df_global_climate.to_pickle(f"{root_directory_path}/global_climate.pkl")
Legend here
def download_land_cover_dataset(root_directory_path):
    # (N, W) tile labels covering Colombia; '00' keeps the zero-padded
    # equator tile name
    range_values = [(20, 80), (10, 80), (10, 70), ('00', 80), ('00', 70)]
    for year in range(2000, 2021, 5):
        for N, W in range_values:
            remote_url = (f"https://storage.googleapis.com/earthenginepartners-hansen/"
                          f"GLCLU2000-2020/v2/{year}/{N}N_0{W}W.tif")
            land_cover_path = f"{root_directory_path}/land_cover_Colombia_{year}_{N}N_0{W}W.tif"
            request.urlretrieve(remote_url, land_cover_path)
    return range_values
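The mixed int/str entries in range_values exist only to format the GLCLU tile names; a quick check of the names they generate (no download involved):

range_values = [(20, 80), (10, 80), (10, 70), ('00', 80), ('00', 70)]
for N, W in range_values:
    print(f"{N}N_0{W}W.tif")
# -> 20N_080W.tif, 10N_080W.tif, 10N_070W.tif, 00N_080W.tif, 00N_070W.tif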
# Fragment of the land-cover sampling loop (enclosing function not fully
# preserved): accumulate sampled values per coordinate and year.
land_cover_values['lat'] += list(lat_values)
land_cover_values['lon'] += list(lon_values)
land_cover_values['year'] += list(np.full(len(lat_values), year))
land_cover_values['land_cover'] += list(result)
os.remove(f"{root_directory_path}/land_cover_Colombia_{year}_{name}.tif")
df = pd.DataFrame(land_cover_values)
df.to_csv(f"{root_directory_path}/land_cover_Colombia_{name}.csv", index=False)

# Tail of union_land_cover_data:
return df_land_cover.sort_values(by="year").dropna()
from sklearn.neighbors import KNeighborsClassifier

def get_model(df_land_cover):
    X = df_land_cover[['lat', 'lon', 'year']].values
    y = df_land_cover['land_cover'].values
    # Assumption: the model construction is missing from this export; a
    # nearest-neighbour classifier is a stand-in, not the original choice
    model = KNeighborsClassifier(n_neighbors=1).fit(X, y)
    return model
def save_df_land_cover_predicted(root_directory_path, model,
                                 interpolated_values, extrapolated_values):
    land_covers_interpolated = model.predict(interpolated_values)
    land_covers_extrapolated = model.predict(extrapolated_values)
    df_land_cover_predicted = pd.DataFrame({
        'latitude': np.append(interpolated_values[:, 0], extrapolated_values[:, 0]),
        'longitude': np.append(interpolated_values[:, 1], extrapolated_values[:, 1]),
        'year': np.append(interpolated_values[:, 2], extrapolated_values[:, 2]).astype(int),
        'land_cover': np.append(land_covers_interpolated, land_covers_extrapolated).astype(int)
    }).sort_values(by="year")
    df_land_cover_predicted.to_pickle(f"{root_directory_path}/land_cover.pkl")
def download_land_cover_legend(root_directory_path):
    remote_url = "https://storage.googleapis.com/earthenginepartners-hansen/GLCLU2000-2020/legend.xlsx"
    land_cover_legend_path = f"{root_directory_path}/land_cover_legend_Colombia.xlsx"
    request.urlretrieve(remote_url, land_cover_legend_path)
    return land_cover_legend_path
def save_df_land_cover_legend(root_directory_path, land_cover_legend_path):
    df_land_cover_legend = pd.read_excel(land_cover_legend_path)
    df_land_cover_legend = (df_land_cover_legend.drop(columns=["Color code"])
                            .rename(columns={'Unnamed: 2': 'class'}))
    # Same column
    set_values(df_land_cover_legend, 'General class', 'General class',
               [(0, 96), (100, 196), (200, 207)], [97, 197, 208, 242, 245, 251, 255])
    set_values(df_land_cover_legend, 'class', 'class',
               [(0, 1), (2, 18), (19, 24), (25, 48), (100, 101), (102, 118),
                (119, 124), (125, 148)], [49, 149])
    # Other column
    set_values(df_land_cover_legend, 'class', 'General class',
               [(200, 207), (241, 241), (244, 244), (250, 250), (254, 254)])
    set_values(df_land_cover_legend, 'Sub-class', 'General class',
               [(241, 241), (244, 244), (250, 250), (254, 254)])
    df_land_cover_legend.to_pickle(f"{root_directory_path}/land_cover_legend.pkl")
    os.remove(land_cover_legend_path)
def save_df_land_cover(root_directory_path, df_forest_fire):  # signature reconstructed from the call in download_data
    range_values = download_land_cover_dataset(root_directory_path)
    # Next two lines reconstructed from the utility helpers defined above
    lat_lon_values = get_lat_lon_values(df_forest_fire)
    interpolated_values, extrapolated_values = get_inter_extrapolated_values(df_forest_fire)
    split_lat_lon_values(lat_lon_values, range_values, root_directory_path)
    df_land_cover = union_land_cover_data(range_values, root_directory_path)
    land_cover_legend_path = download_land_cover_legend(root_directory_path)
    model = get_model(df_land_cover)
    save_df_land_cover_predicted(root_directory_path, model,
                                 interpolated_values, extrapolated_values)
    save_df_land_cover_legend(root_directory_path, land_cover_legend_path)
# Fragment of the population-density extraction (enclosing function not
# fully preserved).
# Minimums
lat_min, lat_max = lat_values.min(), lat_values.max()
lon_min, lon_max = lon_values.min(), lon_values.max()
# Set values
df_pd.reset_index(drop=True, inplace=True)
lat, lon = df_pd['latitude'].to_numpy(), df_pd['longitude'].to_numpy()

df_population_density.to_csv(f"{root_directory_path}/population_density_Colombia_{year}.csv", index=False)
os.remove(pd_path)
def union_pd_data(root_directory_path):
    df_population_density = pd.DataFrame()
    for year in range(2002, 2021):
        df_ppd = pd.read_csv(f"{root_directory_path}/population_density_Colombia_{year}.csv")
        df_population_density = pd.concat([df_population_density, df_ppd])
        os.remove(f"{root_directory_path}/population_density_Colombia_{year}.csv")
    return df_population_density.sort_values(by="year").dropna()
from sklearn.ensemble import RandomForestRegressor

def get_regressor(df_population_density):
    X = df_population_density[['latitude', 'longitude', 'year']].values  # features: latitude, longitude and year
    y = df_population_density['population_density'].values  # population density as the dependent variable
    # Assumption: the regressor construction is missing from this export; a
    # random-forest regressor is a stand-in, not the original choice
    regressor = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)
    return regressor
def save_df_population_density_predicted(root_directory_path, df_population_density,
                                         regressor, extrapolated_values):
    densities_predicted = regressor.predict(extrapolated_values)
    df_pd_predicted = pd.DataFrame({
        'latitude': extrapolated_values[:, 0],
        'longitude': extrapolated_values[:, 1],
        'year': extrapolated_values[:, 2].astype(int),
        'population_density': densities_predicted
    })
    df_pd_predicted = pd.concat([df_population_density, df_pd_predicted]).sort_values(by="year")
    df_pd_predicted.to_pickle(f"{root_directory_path}/population_density.pkl")
def save_df_population_density(root_directory_path, df_forest_fire):  # signature reconstructed from the call in download_data
    _, extrapolated_values = get_inter_extrapolated_values(df_forest_fire)
    df_population_density = union_pd_data(root_directory_path)
    regressor = get_regressor(df_population_density)
    save_df_population_density_predicted(root_directory_path, df_population_density,
                                         regressor, extrapolated_values)
Union Data
def read_values(root_directory_path):
    df_ndvi = pd.read_pickle(f"{root_directory_path}/ndvi.pkl")
    df_land_cover = pd.read_pickle(f"{root_directory_path}/land_cover.pkl")
    df_forest_fire = pd.read_pickle(f"{root_directory_path}/forest_fire.pkl")
    df_global_climate = pd.read_pickle(f"{root_directory_path}/global_climate.pkl")
    df_land_cover_legend = pd.read_pickle(f"{root_directory_path}/land_cover_legend.pkl")
    df_population_density = pd.read_pickle(f"{root_directory_path}/population_density.pkl")
    # Return the frames in the order union_data unpacks them
    return (df_ndvi, df_land_cover, df_forest_fire, df_global_climate,
            df_land_cover_legend, df_population_density)
def union_data(root_directory_path):
    (df_ndvi, df_land_cover, df_forest_fire, df_global_climate,
     df_land_cover_legend, df_population_density) = read_values(root_directory_path)
    df_forest_fire['date'] = pd.to_datetime(df_forest_fire['date'])
    df_forest_fire['year'] = df_forest_fire['date'].astype(str).str.slice(start=0, stop=4).astype(int)
    # ... the merge steps that build df_final are not preserved in this export ...
    return df_final
Download data
def download_data(root_directory_path):
    df_forest_fire = get_df_forest_fire(root_directory_path)
    save_df_ndvi(root_directory_path, df_forest_fire.copy())
    save_df_global_climate(root_directory_path, df_forest_fire.copy())
    save_df_land_cover(root_directory_path, df_forest_fire.copy())
    save_df_population_density(root_directory_path, df_forest_fire.copy())
Save data
root_directory_path = get_root_directory()
download_data(root_directory_path)
df_final = union_data(root_directory_path)
df_final.head(5)
{"type":"dataframe","variable_name":"df_final"}
df_final.to_pickle(f"{root_directory_path}/final_dataset.pkl")