"""Drop exact duplicate rows from each normalized table CSV.

For every date tag from "2018" through "2024", reads
``data/processed/Table<date>_normalized.csv`` and writes the de-duplicated
rows to ``data/processed/Table<date>_no_dup.csv``.
"""

import pandas as pd


def remove_duplicates(csv_file, date):
    """Write a copy of *csv_file* with fully identical rows removed.

    The row counts before and after de-duplication are printed to stdout.
    The output directory ``data/processed`` is not created here; it must
    already exist.

    Args:
        csv_file: Path of the CSV file to read.
        date: Tag inserted into the output file name
            (``data/processed/Table<date>_no_dup.csv``).

    Returns:
        The de-duplicated DataFrame that was written to disk.
    """
    df = pd.read_csv(csv_file)
    # No column subset is passed, so a row is dropped only when it matches
    # an earlier row in every column.
    df_no_duplicates = df.drop_duplicates()

    print(f"Nombre de lignes AVANT suppression des doublons : {len(df)}")
    print(f"Nombre de lignes APRES suppression des doublons : {len(df_no_duplicates)}")

    # index=False keeps the pandas row index out of the output file.
    df_no_duplicates.to_csv(f"data/processed/Table{date}_no_dup.csv", index=False)
    print(f"Fichier sauvegardé sans doublons pour {date}")

    return df_no_duplicates


def main():
    """De-duplicate the normalized table of every date tag, "2018" to "2024"."""
    dates = [f"20{i:02d}" for i in range(18, 25)]

    for date in dates:
        print(f"Suppression des doublons pour {date}")
        csv_file = f"data/processed/Table{date}_normalized.csv"
        remove_duplicates(csv_file, date)


# Guarded so that importing this module does not start processing files.
if __name__ == "__main__":
    main()