"""Build the 2024 table by joining the monthly UDI_RES and UDI_PLV extracts.

For every month of 2024 the script:

1. reads ``data/raw/UDI_RES_<date>.txt`` and ``data/raw/UDI_PLV_<date>.txt``,
   keeps a fixed subset of columns and writes two intermediate CSV files;
2. outer-joins both intermediate files on ``referenceprel`` into
   ``data/processed/Table<date>.csv``;
3. removes the intermediate files once the joined table exists.

Finally every monthly table is concatenated into
``data/processed/Table2024.csv``.
"""

import functools
import math
import os

import pandas as pd
import pyproj


@functools.lru_cache(maxsize=1)
def _lambert93_transformer():
    """Return the shared Lambert 93 -> WGS 84 transformer, built on first use.

    Building a transformer is far more expensive than using one, so it is
    created once instead of once per converted point.
    """
    return pyproj.Transformer.from_crs("EPSG:2154", "EPSG:4326", always_xy=True)


def lambert93_to_wgs84(x, y):
    """Convert Lambert 93 coordinates to WGS 84 latitude and longitude.

    Args:
        x: Lambert 93 easting, as a number or a string parseable by ``float``.
        y: Lambert 93 northing, as a number or a string parseable by ``float``.

    Returns:
        A ``(lat, lon)`` tuple of floats, or ``(None, None)`` when the input
        is missing, not numeric, not finite, or cannot be projected.
    """
    try:
        x, y = float(x), float(y)
        # Missing CSV cells arrive as NaN, which float() accepts silently.
        if not (math.isfinite(x) and math.isfinite(y)):
            return None, None
        # always_xy=True: the transformer takes (x, y) and yields (lon, lat).
        lon, lat = _lambert93_transformer().transform(x, y)
    except (TypeError, ValueError):
        return None, None

    # With its default error checking disabled, pyproj reports a failed
    # projection as inf rather than raising.
    if not (math.isfinite(lon) and math.isfinite(lat)):
        return None, None
    return lat, lon


def charger_donnees(date):
    """Load the UDI_RES and UDI_PLV extracts for one month and save trimmed copies.

    Args:
        date: Month identifier used in the file names (e.g. ``"202401"``).

    Returns:
        ``True`` when both intermediate files were written, ``False`` when at
        least one of the raw input files is missing.

    Raises:
        KeyError: If an expected column is absent from one of the raw files.
    """
    file_path_res = f"data/raw/UDI_RES_{date}.txt"
    file_path_plv = f"data/raw/UDI_PLV_{date}.txt"

    if not (os.path.exists(file_path_res) and os.path.exists(file_path_plv)):
        print(f"Fichiers manquants pour {date}, passage...")
        return False

    # The output directory may not exist on a fresh checkout.
    os.makedirs("data/processed", exist_ok=True)

    # --- UDI_RES ---
    columns_res = [
        "cddept", "referenceprel", "cdparametre", "rsana",
        "cdunitereferencesiseeaux", "cdunitereference", "rqana",
        "rssigne", "representativite",
    ]
    data_res = pd.read_csv(file_path_res, sep=",", dtype=str)
    data_res = data_res[columns_res].copy()

    # Turn rqana into a number.
    # NOTE(review): the pattern keeps only the first run of digits, so a value
    # such as "0,5" becomes 0 - confirm this is the intended conversion.
    data_res["rqana"] = pd.to_numeric(
        data_res["rqana"].str.extract(r"(\d+)", expand=False),
        errors="coerce",
    )
    data_res.to_csv(f"data/processed/res{date}.csv", index=False)

    # --- UDI_PLV ---
    columns_plv = [
        "cddept", "inseecommune", "nomcommune", "cdreseau", "cdpointsurv",
        "nompointsurv", "referenceprel", "dateprel", "finaliteprel",
        "conclusionprel", "cdtypeeau", "plvconformitebacterio",
        "plvconformitechimique", "plvconformiterefbacterio",
        "plvconformiterefchimique", "coord_x", "coord_y",
    ]
    data_plv = pd.read_csv(file_path_plv, sep=",", dtype=str)
    data_plv = data_plv[columns_plv].copy()

    # Convert the coordinates. A plain loop over the two columns also works on
    # an empty extract, where DataFrame.apply(axis=1, result_type="expand")
    # returns the frame itself and breaks the two-column assignment.
    coords = [
        lambert93_to_wgs84(x, y)
        for x, y in zip(data_plv["coord_x"], data_plv["coord_y"])
    ]
    # As before, coord_x receives the latitude and coord_y the longitude.
    data_plv["coord_x"] = [lat for lat, _ in coords]
    data_plv["coord_y"] = [lon for _, lon in coords]
    data_plv.to_csv(f"data/processed/plv{date}.csv", index=False)

    return True


def jointure(date):
    """Outer-join the intermediate RES and PLV files of one month.

    Args:
        date: Month identifier used in the file names (e.g. ``"202401"``).

    Returns:
        ``True`` when ``data/processed/Table<date>.csv`` was written,
        ``False`` when an intermediate file is missing.
    """
    res_path = f"data/processed/res{date}.csv"
    plv_path = f"data/processed/plv{date}.csv"

    if not (os.path.exists(res_path) and os.path.exists(plv_path)):
        print(f"Fichiers intermédiaires manquants pour {date}, passage...")
        return False

    data_res = pd.read_csv(res_path, dtype=str)
    data_plv = pd.read_csv(plv_path, dtype=str)

    # Normalise the join key. Both columns are already read as strings, so no
    # astype(str) is needed; it would turn missing references into the
    # literal text "nan".
    data_res["referenceprel"] = data_res["referenceprel"].str.strip()
    data_plv["referenceprel"] = data_plv["referenceprel"].str.strip()

    data = pd.merge(data_res, data_plv, on="referenceprel", how="outer", indicator=True)

    # Report how many rows matched on both sides versus only one.
    print(data["_merge"].value_counts())

    data = data.drop(columns=["_merge"])
    data.to_csv(f"data/processed/Table{date}.csv", index=False)

    return True


def supprimer_donnees(date):
    """Delete the intermediate files of one month, only if its joined table exists.

    Args:
        date: Month identifier used in the file names (e.g. ``"202401"``).

    Raises:
        FileNotFoundError: If the joined table exists but an intermediate
            file has already been removed.
    """
    if not os.path.exists(f"data/processed/Table{date}.csv"):
        return
    os.remove(f"data/processed/res{date}.csv")
    os.remove(f"data/processed/plv{date}.csv")


# Month identifiers "202401" .. "202412".
dates = [f"2024{month:02d}" for month in range(1, 13)]
# Months whose joined table was produced; filled by main().
processed_dates = []


def jointure_finale(dates_traitees=None):
    """Concatenate every monthly table into ``data/processed/Table2024.csv``.

    Args:
        dates_traitees: Month identifiers to include. Defaults to the
            module-level ``processed_dates`` list.
    """
    if dates_traitees is None:
        dates_traitees = processed_dates

    candidates = (f"data/processed/Table{date}.csv" for date in dates_traitees)
    valid_paths = [path for path in candidates if os.path.exists(path)]

    if not valid_paths:
        print("Aucun fichier de jointure valide, fin du processus.")
        return

    frames = [pd.read_csv(path, dtype=str) for path in valid_paths]
    pd.concat(frames, ignore_index=True).to_csv(
        "data/processed/Table2024.csv", index=False
    )
    print("Table finale sauvegardée.")


def main():
    """Process every month in ``dates`` and then build the yearly table."""
    for date in dates:
        print(f"Processing data for {date}")
        if not charger_donnees(date):
            print(f"Chargement échoué pour {date}")
            continue
        if not jointure(date):
            print(f"Jointure échouée pour {date}")
            continue
        supprimer_donnees(date)
        processed_dates.append(date)
        print(f"Data for {date} processed")

    jointure_finale()


if __name__ == "__main__":
    main()