From f12a59f970da9200320162496f66e8d89f3d6b81 Mon Sep 17 00:00:00 2001
From: Gilares Paul <paul.gilares@etu.ec-lyon.fr>
Date: Sat, 15 Mar 2025 16:57:32 +0000
Subject: [PATCH] Edit 2-remove_dup.py

---
 src/2-remove_dup.py | 20 ++++++++++++++++++++
 src/5-remove_dup.py | 24 ------------------------
 2 files changed, 20 insertions(+), 24 deletions(-)
 create mode 100644 src/2-remove_dup.py
 delete mode 100644 src/5-remove_dup.py

diff --git a/src/2-remove_dup.py b/src/2-remove_dup.py
new file mode 100644
index 0000000..10303eb
--- /dev/null
+++ b/src/2-remove_dup.py
@@ -0,0 +1,20 @@
+import pandas as pd
+
+def remove_duplicates(csv_file, date):
+    """Drop exact duplicate rows of csv_file; save data/processed/Table{date}_no_dup.csv."""
+    raw = pd.read_csv(csv_file)
+    deduped = raw.drop_duplicates()
+    # Report the row count before and after so the effect of this step is visible.
+    print(f"Nombre de lignes AVANT suppression des doublons : {len(raw)}")
+    print(f"Nombre de lignes APRES suppression des doublons : {len(deduped)}")
+    deduped.to_csv(f"data/processed/Table{date}_no_dup.csv", index=False)
+    print(f"Fichier sauvegardé sans doublons pour {date}")
+
+
+# Four-digit year strings "2018" .. "2024"; each one selects an input/output file pair.
+dates = [f"20{year:02d}" for year in range(18, 25)]
+processed_dates = []  # NOTE(review): never populated in this file
+for date in dates:
+    print(f"Suppression des doublons pour {date}")
+    csv_file = f"data/processed/Table{date}_normalized.csv"
+    remove_duplicates(csv_file, date)
diff --git a/src/5-remove_dup.py b/src/5-remove_dup.py
deleted file mode 100644
index 10ad128..0000000
--- a/src/5-remove_dup.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import pandas as pd
-
-def remove_duplicates(csv_file, date):
-    # Load the CSV file
-    df = pd.read_csv(csv_file)
-
-    # Remove duplicates
-    df_no_duplicates = df.drop_duplicates()
-
-    print(f"Nombre de lignes après suppression des doublons : {len(df_no_duplicates)}")
-
-    # Save the file without duplicates
-    df_no_duplicates.to_csv(f"data/processed/Final{date}.csv", index=False)
-
-    print("Duplicates removed and file saved")
-
-
-dates = ["20" + str(i).zfill(2) for i in range(18, 25)]
-processed_dates = []
-
-for date in dates:
-    print(f"Processing data for {date}")
-    csv_file = f"data/processed/Filtre{date}_gps_corrected.csv"  # Replace with your CSV file path
-    remove_duplicates(csv_file, date)
-- 
GitLab