From f12a59f970da9200320162496f66e8d89f3d6b81 Mon Sep 17 00:00:00 2001
From: Gilares Paul <paul.gilares@etu.ec-lyon.fr>
Date: Sat, 15 Mar 2025 16:57:32 +0000
Subject: [PATCH] Edit 2-remove_dup.py

---
Reviewer notes (free-form text placed after the three-dash line and before
the first "diff --git" header; it is not part of the commit message):

* Summary: adds src/2-remove_dup.py and deletes src/5-remove_dup.py. Both
  files define remove_duplicates(csv_file, date), which loads csv_file with
  pd.read_csv, calls DataFrame.drop_duplicates() with no arguments, and
  writes the result with to_csv(..., index=False).
* Input path changes from data/processed/Filtre{date}_gps_corrected.csv to
  data/processed/Table{date}_normalized.csv.
* Output path changes from data/processed/Final{date}.csv to
  data/processed/Table{date}_no_dup.csv.
* The added script prints the row count before and after deduplication; the
  deleted script printed only the count after.
* "dates" evaluates to the strings "2018" through "2024": range(18, 25),
  zero-padded to two digits and prefixed with "20".
* NOTE(review): processed_dates is assigned an empty list but is never read
  in either version, as far as this patch shows.
* NOTE(review): in both versions the driver loop is at module top level (no
  __main__ guard), so importing the module runs the whole pipeline.
* NOTE(review): the line breaks and indentation below were restored from a
  whitespace-collapsed copy of this patch. Four-space indentation and empty
  blank lines are assumed; verify the deletion hunk against blob 10ad128
  before applying (or apply with whitespace differences ignored).

 src/2-remove_dup.py | 20 ++++++++++++++++++++
 src/5-remove_dup.py | 24 ------------------------
 2 files changed, 20 insertions(+), 24 deletions(-)
 create mode 100644 src/2-remove_dup.py
 delete mode 100644 src/5-remove_dup.py

diff --git a/src/2-remove_dup.py b/src/2-remove_dup.py
new file mode 100644
index 0000000..10303eb
--- /dev/null
+++ b/src/2-remove_dup.py
@@ -0,0 +1,20 @@
+import pandas as pd
+
+def remove_duplicates(csv_file, date):
+    df = pd.read_csv(csv_file)
+    df_no_duplicates = df.drop_duplicates()
+
+    print(f"Nombre de lignes AVANT suppression des doublons : {len(df)}")
+    print(f"Nombre de lignes APRES suppression des doublons : {len(df_no_duplicates)}")
+
+    df_no_duplicates.to_csv(f"data/processed/Table{date}_no_dup.csv", index=False)
+    print(f"Fichier sauvegardé sans doublons pour {date}")
+
+
+dates = ["20" + str(i).zfill(2) for i in range(18, 25)]
+processed_dates = []
+
+for date in dates:
+    print(f"Suppression des doublons pour {date}")
+    csv_file = f"data/processed/Table{date}_normalized.csv"
+    remove_duplicates(csv_file, date)
diff --git a/src/5-remove_dup.py b/src/5-remove_dup.py
deleted file mode 100644
index 10ad128..0000000
--- a/src/5-remove_dup.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import pandas as pd
-
-def remove_duplicates(csv_file, date):
-    # Load the CSV file
-    df = pd.read_csv(csv_file)
-
-    # Remove duplicates
-    df_no_duplicates = df.drop_duplicates()
-
-    print(f"Nombre de lignes après suppression des doublons : {len(df_no_duplicates)}")
-
-    # Save the file without duplicates
-    df_no_duplicates.to_csv(f"data/processed/Final{date}.csv", index=False)
-
-    print("Duplicates removed and file saved")
-
-
-dates = ["20" + str(i).zfill(2) for i in range(18, 25)]
-processed_dates = []
-
-for date in dates:
-    print(f"Processing data for {date}")
-    csv_file = f"data/processed/Filtre{date}_gps_corrected.csv" # Replace with your CSV file path
-    remove_duplicates(csv_file, date)
-- 
GitLab