diff --git a/preprocess/__pycache__/preprocess4.cpython-310.pyc b/preprocess/__pycache__/preprocess4.cpython-310.pyc
index 23bcd7bad89c160503474ec19b0f7b2749e102af..b6a7cb001a2fe8ebb2b767265c01b7938ada5423 100644
Binary files a/preprocess/__pycache__/preprocess4.cpython-310.pyc and b/preprocess/__pycache__/preprocess4.cpython-310.pyc differ
diff --git a/preprocess/prediction_on.ipynb b/preprocess/prediction_on.ipynb
index 99c13c03f7da97f25216c158f812239adc3f8a7c..028cda3f3e6d78dd462bb22303b66108b091f7de 100644
--- a/preprocess/prediction_on.ipynb
+++ b/preprocess/prediction_on.ipynb
@@ -4046,7 +4046,6 @@
     "for metric_name, metric_value in metrics.items():\n",
     "    print(f\"{metric_name}: {metric_value:.4f}\")\n",
     "\n",
-    "# 1. Graphique des valeurs prédites vs réelles (jeu de test)\n",
     "plt.figure(figsize=(10, 6))\n",
     "plt.scatter(y_test, y_pred_test, alpha=0.5)\n",
     "plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')\n",
@@ -4056,7 +4055,6 @@
     "plt.grid(True)\n",
     "plt.show()\n",
     "\n",
-    "# 2. Histogramme des résidus\n",
     "residuals_test = y_test - y_pred_test\n",
     "plt.figure(figsize=(10, 6))\n",
     "plt.hist(residuals_test, bins=30, alpha=0.7, color='blue')\n",
@@ -4067,7 +4065,6 @@
     "plt.grid(True)\n",
     "plt.show()\n",
     "\n",
-    "# 3. Graphique des résidus vs valeurs prédites\n",
     "plt.figure(figsize=(10, 6))\n",
     "plt.scatter(y_pred_test, residuals_test, alpha=0.5)\n",
     "plt.axhline(y=0, color='r', linestyle='--')\n",
@@ -4077,14 +4074,12 @@
     "plt.grid(True)\n",
     "plt.show()\n",
     "\n",
-    "# 4. Visualisation de l'importance des features\n",
     "plt.figure(figsize=(12, 8))\n",
     "xgb.plot_importance(model, importance_type='weight', max_num_features=20)\n",
     "plt.title('Importance des features')\n",
     "plt.tight_layout()\n",
     "plt.show()\n",
     "\n",
-    "# 5. Graphique des valeurs prédites et réelles dans l'ordre des indices\n",
     "plt.figure(figsize=(12, 6))\n",
     "plt.plot(y_test.reset_index(drop=True), 'b-', label='Valeurs réelles')\n",
     "plt.plot(y_pred_test, 'r-', label='Valeurs prédites')\n",
@@ -4095,7 +4090,6 @@
     "plt.grid(True)\n",
     "plt.show()\n",
     "\n",
-    "# 6. Courbe d'apprentissage si disponible\n",
     "if hasattr(model, 'evals_result'):\n",
     "    results = model.evals_result()\n",
     "    if results:\n",
diff --git a/preprocess/preprocess4.py b/preprocess/preprocess4.py
index ddd88f693cb6c2b41acb9efb57fa4763fc9c3d8a..623ab72186b78e0c6bf8f96f453dc8664089d963 100644
--- a/preprocess/preprocess4.py
+++ b/preprocess/preprocess4.py
@@ -15,15 +15,21 @@ class PreprocessData:
         self.y = path_y
   
 
+        
+        
+        
+    def process_transformation(self):  # run the preprocessing steps below in order and return (self.X, self.y)
         self.enlever_index()
         self.remplir_gene()
-        #self.virer_patient()
+       
         self.encoder_cohort()
         self.imputer_age_at_diagnosis()
         #self.rajout_feature_temps()
         self.encoder_patient()
-        
-        
+        self.imputer_valeurs_on()  # fills missing 'on' values; it drops 'patient_id' from its working copy, so that column must still exist here
+        self.rajout_feature_temps()  # groups by 'patient_id', so it has to run before that column is dropped on the next line
+        self.virer_patient_et_autre()  # removes 'patient_id' and the two time_since_intake_* columns
+        return self.X,self.y  # self.y is returned as stored on the instance; no step called here reassigns it in the visible code
     
     def enlever_index(self):
         self.X.drop('Index',axis=1,inplace=True)
@@ -37,7 +43,6 @@ class PreprocessData:
         X_patient=vectorizer.fit_transform(X_patient)
         X_patient=pd.DataFrame(X_patient.toarray(),columns=vectorizer.get_feature_names_out())
         self.X=pd.concat([self.X,X_patient],axis=1)
-        self.X.drop('patient_id',axis=1,inplace=True)
         return self.X
             
         
@@ -76,8 +81,10 @@ class PreprocessData:
         self.X['est_OTHER+']=self.X['gene'].apply(lambda x: f_o(x))
         self.X.drop('gene',axis=1,inplace=True)
         return self.X
-    def virer_patient(self):
+    def virer_patient_et_autre(self):  # drop the identifier and intake-timing columns from self.X in place
         self.X.drop('patient_id',axis=1,inplace=True)
+        self.X.drop(['time_since_intake_on','time_since_intake_off'],axis=1,inplace=True)  # a missing column raises KeyError (pandas default errors='raise')
+        return self.X  # same DataFrame object that was mutated above
     def get_X(self):
         return self.X
     def get_y(self):
@@ -97,19 +104,19 @@ class PreprocessData:
 
         # rajouter la progression du score on et off depuis la dernière visite
         self.X['diff_on'] = self.X.groupby('patient_id')['on'].diff()
-        self.X['diff_off'] = self.X.groupby('patient_id')['off'].diff()
+        #self.X['diff_off'] = self.X.groupby('patient_id')['off'].diff()
 
         # rajouter la progression du score on et off depuis la première visite
         self.X['diff_on_first'] = self.X.groupby('patient_id')['on'].transform('first')
-        self.X['diff_off_first'] = self.X.groupby('patient_id')['off'].transform('first')
+        #self.X['diff_off_first'] = self.X.groupby('patient_id')['off'].transform('first')
 
         # rajouter la moyenne du score on et off sur toutes les visites
         self.X['mean_on'] = self.X.groupby('patient_id')['on'].transform('mean')
-        self.X['mean_off'] = self.X.groupby('patient_id')['off'].transform('mean')
+        #self.X['mean_off'] = self.X.groupby('patient_id')['off'].transform('mean')
 
         # rajouter l'écart type du score on et off sur toutes les visites
         self.X['std_on'] = self.X.groupby('patient_id')['on'].transform('std')
-        self.X['std_off'] = self.X.groupby('patient_id')['off'].transform('std')
+        #self.X['std_off'] = self.X.groupby('patient_id')['off'].transform('std')
 
         # rajouter le temps depuis la dernière visite
         self.X['time_since_last_visit'] = self.X.groupby('patient_id')['age'].diff()
@@ -170,4 +177,35 @@ class PreprocessData:
         print("Variable 'disease_duration' ajoutée avec succès.")
         
         return self.X
-        
\ No newline at end of file
+    def imputer_valeurs_on(self):
+        """
+        Impute the missing values of 'on' using a linear regression; print the hold-out MAE and return self.X.
+        """
+        X_copy = self.X.copy()  # work on a copy so the column drops below never touch self.X
+        X_copy = X_copy.drop(['ledd','off','time_since_intake_on','time_since_intake_off','patient_id'],axis=1)  # these columns are excluded from the regression features
+
+        X_copy_a_imputer = X_copy[X_copy['on'].isna()]  # rows whose 'on' is missing (to be predicted)
+        X_copy_connu = X_copy[~X_copy['on'].isna()]  # rows whose 'on' is known (used to fit and score the model)
+
+        y_copy = X_copy_connu['on']  # regression target
+
+        X_train, X_test, y_train, y_test = train_test_split(X_copy_connu.drop('on',axis=1), y_copy, test_size=0.2, random_state=42)  # hold out 20% (fixed seed) to score the model
+
+        model = LinearRegression()  # NOTE(review): presumably needs numeric, NaN-free feature columns - not checked here; confirm
+        model.fit(X_train, y_train)  # NOTE(review): fitted on the 80% training split only, not on every known row
+
+        y_pred = model.predict(X_test)
+        mae = mean_absolute_error(y_test, y_pred)  # error on the held-out 20%, reported for information only
+        print(f"Performance du modèle d'imputation linéaire - MAE: {mae:.2f}")
+        print("Imputation des valeurs manquantes de 'on'...")
+        X_copy.loc[X_copy['on'].isna(),'on'] = model.predict(X_copy_a_imputer.drop('on',axis=1))  # NOTE(review): with no missing 'on' this predicts on zero rows - likely rejected by scikit-learn; confirm and guard
+        self.X['on']=X_copy['on']  # write the completed column back into self.X
+        print("Les valeurs de 'on' ont été imputées avec succès.")
+        return self.X
+
+# code test (placeholder - no test code follows in this patch)
+
+    
+
+
+