# 1-first_processing.py
# Forked from Romain Vuillemot / INF-TC1 (source project has limited visibility).
import pandas as pd
import os
import pyproj
import numpy as np
# Reusable transformer: Lambert 93 (EPSG:2154) -> WGS84 (EPSG:4326).
# always_xy=True makes transform() take/return (x=easting, y=northing) i.e. (lon, lat).
transformer = pyproj.Transformer.from_crs("EPSG:2154", "EPSG:4326", always_xy=True)


def lambert93_to_wgs84_batch(x, y):
    """Convert two pandas Series of Lambert-93 coordinates to WGS84 strings.

    Rows where either coordinate is missing (NaN) are left untouched.
    Returns the pair (x, y) where x holds the latitude and y the longitude.
    NOTE(review): the original deliberately writes lat into x and lon into y
    (transform returns lon, lat) — confirm downstream consumers expect
    coord_x = latitude before changing this.

    Fix: operates on copies instead of mutating the caller's Series in place.
    """
    x, y = x.copy(), y.copy()
    mask = x.notna() & y.notna()  # skip NaN rows; transform would choke on them
    x_valid = x[mask].astype(float)
    y_valid = y[mask].astype(float)
    lon, lat = transformer.transform(x_valid.to_numpy(), y_valid.to_numpy())
    # Write back as strings (the frames are loaded with string dtype).
    x.loc[mask] = lat.astype(str)
    y.loc[mask] = lon.astype(str)
    return x, y
def charger_donnees(date):
    """Load the raw UDI RES/PLV files for *date*, clean them and write
    intermediate CSVs under data/processed/.

    Returns True on success, False when either raw file is missing.
    """
    file_path_res = f"data/raw/UDI_RES_{date}.txt"
    file_path_plv = f"data/raw/UDI_PLV_{date}.txt"
    if not os.path.exists(file_path_res) or not os.path.exists(file_path_plv):
        print(f"Attention fichiers manquants pour {date}.")
        return False
    # Fix: to_csv does not create intermediate directories, so make sure the
    # output directory exists before writing.
    os.makedirs("data/processed", exist_ok=True)
    # Columns of interest from UDI_RES (sample analysis results)
    columns_res = [
        "cddept", "referenceprel", "cdparametre", "rsana", "cdunitereferencesiseeaux", "cdunitereference", "rqana",
        "rssigne"
    ]
    dtype_res = {col: "string" for col in columns_res}
    data_res = pd.read_csv(file_path_res, sep=",", dtype=dtype_res, usecols=columns_res)
    # NOTE(review): r'(\d+)' keeps only the leading integer part of rqana
    # ("0,5" -> 0) — TODO confirm this truncation is intended.
    data_res['rqana'] = pd.to_numeric(data_res['rqana'].str.extract(r'(\d+)', expand=False), errors='coerce')
    output_res = f"data/processed/res{date}.csv"
    data_res.to_csv(output_res, index=False)
    # Columns of interest from UDI_PLV (sample metadata)
    columns_plv = [
        "cddept", "inseecommune", "nomcommune", "cdreseau", "cdpointsurv", "nompointsurv",
        "referenceprel", "dateprel", "coord_x", "coord_y"
    ]
    dtype_plv = {col: "string" for col in columns_plv}
    data_plv = pd.read_csv(file_path_plv, sep=",", dtype=dtype_plv, usecols=columns_plv)
    # Convert coordinates from Lambert 93 to WGS84
    data_plv["coord_x"], data_plv["coord_y"] = lambert93_to_wgs84_batch(data_plv["coord_x"], data_plv["coord_y"])
    output_plv = f"data/processed/plv{date}.csv"
    data_plv.to_csv(output_plv, index=False)
    return True
def jointure(date):
    """Merge the processed RES and PLV CSVs for *date* into Table{date}.csv.

    Prints the merge diagnostics (_merge counts) and returns True.
    """
    resultats = pd.read_csv(f"data/processed/res{date}.csv", dtype=str)
    prelevements = pd.read_csv(f"data/processed/plv{date}.csv", dtype=str)
    # Normalise the join key (sample reference code) on both sides.
    for frame in (resultats, prelevements):
        frame['referenceprel'] = frame['referenceprel'].astype(str).str.strip()
    # Outer join keeps unmatched rows from both files; the indicator column
    # is printed for a quick sanity check, then dropped before saving.
    fusion = pd.merge(resultats, prelevements, on="referenceprel", how="outer", indicator=True)
    print(fusion["_merge"].value_counts())
    fusion.drop(columns=['_merge'], inplace=True)
    fusion.to_csv(f"data/processed/Table{date}.csv", index=False)
    return True
def supprimer_donnees(date):
    """Delete the intermediate res/plv CSVs once the joined table exists.

    Only removes the temporaries when Table{date}.csv is present; already-
    missing temporaries are ignored (fix: the original raised
    FileNotFoundError when called twice or after a partial run).
    """
    table_path = f"data/processed/Table{date}.csv"
    if not os.path.exists(table_path):
        return
    for tmp in (f"data/processed/res{date}.csv", f"data/processed/plv{date}.csv"):
        if os.path.exists(tmp):
            os.remove(tmp)
def normalize_cdparametre(date):
    """Normalise parameter codes to a single textual format and save.

    Codes come in inconsistent formats ("1340" vs "1340.0"); every non-missing
    code is forced to end with ".0". Fix: missing values previously became the
    literal string "nan.0" via astype(str); they are now left as NaN.
    Returns the normalised DataFrame (also written to Table{date}_normalized.csv).
    """
    table_path = f"data/processed/Table{date}.csv"
    output_file = f"data/processed/Table{date}_normalized.csv"
    df = pd.read_csv(table_path, dtype={'cdparametre': str})

    def _norm(code):
        # Leave missing codes untouched instead of producing "nan.0".
        if pd.isna(code):
            return code
        code = str(code)
        return code if code.endswith('.0') else code + '.0'

    df['cdparametre'] = df['cdparametre'].apply(_norm)
    df.to_csv(output_file, index=False)
    return df
# Campaign years 2018 through 2024, as "20YY" strings.
dates = [f"20{annee}" for annee in range(18, 25)]
processed_dates = []
for date in dates:
    print(f"Processing de la data de {date}")
    # Guard clauses: skip the year as soon as one stage fails.
    if not charger_donnees(date):
        print(f"Chargement échoué pour {date}")
        continue
    if not jointure(date):
        print(f"Jointure échouée pour {date}")
        continue
    supprimer_donnees(date)
    normalize_cdparametre(date)
    processed_dates.append(date)
    print(f"Data de {date} traitée")