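"""1-first_processing.py

Preprocessing of the yearly UDI_RES / UDI_PLV raw files: select the relevant
columns, convert the Lambert 93 coordinates to WGS84, join the sampling
results with the sampling metadata, and normalize the parameter codes.
"""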
import pandas as pd
import os
import pyproj
import numpy as np

transformer = pyproj.Transformer.from_crs("EPSG:2154", "EPSG:4326", always_xy=True)

def lambert93_to_wgs84_batch(x, y):
    # Convert coordinates from Lambert 93 to WGS84
    mask = (x.notna()) & (y.notna())  # Mask to skip NaN values
    x_valid, y_valid = x[mask].astype(float), y[mask].astype(float)
    lon, lat = transformer.transform(x_valid.to_numpy(), y_valid.to_numpy())

    # Note: the latitude is written back into x and the longitude into y,
    # so coord_x ends up holding the latitude and coord_y the longitude (as strings).
    x.loc[mask], y.loc[mask] = lat.astype(str), lon.astype(str)
    return x, y
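
# Illustrative use with made-up Lambert 93 values (not taken from the real data):
#   xs = pd.Series(["652000"], dtype="string"); ys = pd.Series(["6862000"], dtype="string")
#   xs, ys = lambert93_to_wgs84_batch(xs, ys)
#   # xs now holds the latitude and ys the longitude, both as strings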
    
def charger_donnees(date):
    file_path_res = f"data/raw/UDI_RES_{date}.txt"
    file_path_plv = f"data/raw/UDI_PLV_{date}.txt"

    if not os.path.exists(file_path_res) or not os.path.exists(file_path_plv):
        print(f"Warning: missing files for {date}.")
        return False

    # Columns of interest in UDI_RES (sampling results)
    columns_res = [
        "cddept", "referenceprel", "cdparametre", "rsana", "cdunitereferencesiseeaux", "cdunitereference", "rqana",
        "rssigne"
    ]
    dtype_res = {col: "string" for col in columns_res}
    data_res = pd.read_csv(file_path_res, sep=",", dtype=dtype_res, usecols=columns_res)
    # Extract the first run of digits from rqana and convert it to a number
    data_res['rqana'] = pd.to_numeric(data_res['rqana'].str.extract(r'(\d+)', expand=False), errors='coerce')

    output_res = f"data/processed/res{date}.csv"
    data_res.to_csv(output_res, index=False)

    # Columns of interest in UDI_PLV (sampling metadata)
    columns_plv = [
        "cddept", "inseecommune", "nomcommune", "cdreseau", "cdpointsurv", "nompointsurv",
        "referenceprel", "dateprel", "coord_x", "coord_y"
    ]
    dtype_plv = {col: "string" for col in columns_plv}
    data_plv = pd.read_csv(file_path_plv, sep=",", dtype=dtype_plv, usecols=columns_plv)

    # Coordinate conversion (Lambert 93 -> WGS84)
    data_plv["coord_x"], data_plv["coord_y"] = lambert93_to_wgs84_batch(data_plv["coord_x"], data_plv["coord_y"])

    output_plv = f"data/processed/plv{date}.csv"
    data_plv.to_csv(output_plv, index=False)

    return True
    
def jointure(date):
    # Join the RES and PLV files.
    res_path = f"data/processed/res{date}.csv"
    plv_path = f"data/processed/plv{date}.csv"

    data_res = pd.read_csv(res_path, dtype=str)
    data_plv = pd.read_csv(plv_path, dtype=str)

    # Strip whitespace from the sample reference codes (the join key)
    data_res['referenceprel'] = data_res['referenceprel'].astype(str).str.strip()
    data_plv['referenceprel'] = data_plv['referenceprel'].astype(str).str.strip()

    data = pd.merge(data_res, data_plv, on="referenceprel", how="outer", indicator=True)

    # Show how many rows matched on both sides vs. only one side
    print(data["_merge"].value_counts())

    output_path = f"data/processed/Table{date}.csv"
    data.drop(columns=['_merge'], inplace=True)
    data.to_csv(output_path, index=False)

    return True
    
def supprimer_donnees(date):
    # Delete the intermediate files only if the join succeeded
    table_path = f"data/processed/Table{date}.csv"
    if os.path.exists(table_path):
        os.remove(f"data/processed/res{date}.csv")
        os.remove(f"data/processed/plv{date}.csv")
    
def normalize_cdparametre(date):
    table_path = f"data/processed/Table{date}.csv"
    output_file = f"data/processed/Table{date}_normalized.csv"
    df = pd.read_csv(table_path, dtype={'cdparametre': str})
    # Cast every parameter code to a string and append ".0" when missing,
    # e.g. "1340" -> "1340.0" (the raw codes come in inconsistent formats)
    df['cdparametre'] = df['cdparametre'].astype(str).apply(lambda x: x if x.endswith('.0') else x + '.0')

    df.to_csv(output_file, index=False)

    return df
    
# Process the yearly files from 2018 to 2024
dates = ["20" + str(i).zfill(2) for i in range(18, 25)]
processed_dates = []

for date in dates:
    print(f"Processing data for {date}")
    if charger_donnees(date):
        if jointure(date):
            supprimer_donnees(date)
            normalize_cdparametre(date)
            processed_dates.append(date)
            print(f"Data for {date} processed")
        else:
            print(f"Join failed for {date}")
    else:
        print(f"Loading failed for {date}")