diff --git a/preprocess/__pycache__/preprocess4.cpython-310.pyc b/preprocess/__pycache__/preprocess4.cpython-310.pyc
index 23bcd7bad89c160503474ec19b0f7b2749e102af..b6a7cb001a2fe8ebb2b767265c01b7938ada5423 100644
Binary files a/preprocess/__pycache__/preprocess4.cpython-310.pyc and b/preprocess/__pycache__/preprocess4.cpython-310.pyc differ
diff --git a/preprocess/prediction_on.ipynb b/preprocess/prediction_on.ipynb
index 99c13c03f7da97f25216c158f812239adc3f8a7c..028cda3f3e6d78dd462bb22303b66108b091f7de 100644
--- a/preprocess/prediction_on.ipynb
+++ b/preprocess/prediction_on.ipynb
@@ -4046,7 +4046,6 @@
     "for metric_name, metric_value in metrics.items():\n",
     "    print(f\"{metric_name}: {metric_value:.4f}\")\n",
     "\n",
-    "# 1. Graphique des valeurs prédites vs réelles (jeu de test)\n",
     "plt.figure(figsize=(10, 6))\n",
     "plt.scatter(y_test, y_pred_test, alpha=0.5)\n",
     "plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')\n",
@@ -4056,7 +4055,6 @@
     "plt.grid(True)\n",
     "plt.show()\n",
     "\n",
-    "# 2. Histogramme des résidus\n",
     "residuals_test = y_test - y_pred_test\n",
     "plt.figure(figsize=(10, 6))\n",
     "plt.hist(residuals_test, bins=30, alpha=0.7, color='blue')\n",
@@ -4067,7 +4065,6 @@
     "plt.grid(True)\n",
     "plt.show()\n",
     "\n",
-    "# 3. Graphique des résidus vs valeurs prédites\n",
     "plt.figure(figsize=(10, 6))\n",
     "plt.scatter(y_pred_test, residuals_test, alpha=0.5)\n",
     "plt.axhline(y=0, color='r', linestyle='--')\n",
@@ -4077,14 +4074,12 @@
     "plt.grid(True)\n",
     "plt.show()\n",
     "\n",
-    "# 4. Visualisation de l'importance des features\n",
     "plt.figure(figsize=(12, 8))\n",
     "xgb.plot_importance(model, importance_type='weight', max_num_features=20)\n",
     "plt.title('Importance des features')\n",
     "plt.tight_layout()\n",
     "plt.show()\n",
     "\n",
-    "# 5. Graphique des valeurs prédites et réelles dans l'ordre des indices\n",
     "plt.figure(figsize=(12, 6))\n",
     "plt.plot(y_test.reset_index(drop=True), 'b-', label='Valeurs réelles')\n",
     "plt.plot(y_pred_test, 'r-', label='Valeurs prédites')\n",
@@ -4095,7 +4090,6 @@
     "plt.grid(True)\n",
     "plt.show()\n",
     "\n",
-    "# 6. Courbe d'apprentissage si disponible\n",
     "if hasattr(model, 'evals_result'):\n",
     "    results = model.evals_result()\n",
     "    if results:\n",
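The notebook diff above removes the numbered section comments from the evaluation cells and keeps the guarded learning-curve block (if hasattr(model, 'evals_result')). As a companion to that last cell, here is a minimal, self-contained sketch of how such a learning curve can be plotted from evals_result() with the xgboost scikit-learn wrapper. It assumes the regressor was fitted with an eval_set; the toy data, variable names, and hyperparameters below are illustrative stand-ins, not the notebook's actual objects.

# Hedged sketch: plot train/test RMSE per boosting round from evals_result().
# Assumes an XGBRegressor fitted with an eval_set; toy data replaces the project's features.
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 5))                                   # toy feature matrix
y = X @ np.array([1.0, -2.0, 0.5, 0.0, 3.0]) + rng.normal(scale=0.3, size=500)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = xgb.XGBRegressor(n_estimators=200, learning_rate=0.1, eval_metric="rmse")
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)

results = model.evals_result()   # e.g. {'validation_0': {'rmse': [...]}, 'validation_1': {'rmse': [...]}}
plt.figure(figsize=(10, 6))
for split_name, history in results.items():
    plt.plot(history["rmse"], label=split_name)
plt.xlabel("Boosting round")
plt.ylabel("RMSE")
plt.title("Learning curve (train vs. test)")
plt.legend()
plt.grid(True)
plt.show()

evals_result() is only populated when an eval_set is passed to fit, which is why the notebook cell checks the result before plotting.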
diff --git a/preprocess/preprocess4.py b/preprocess/preprocess4.py
index ddd88f693cb6c2b41acb9efb57fa4763fc9c3d8a..623ab72186b78e0c6bf8f96f453dc8664089d963 100644
--- a/preprocess/preprocess4.py
+++ b/preprocess/preprocess4.py
@@ -15,15 +15,21 @@ class PreprocessData:
         self.y = path_y
 
 
+
+
+
+
     def process_transformation(self):
         self.enlever_index()
         self.remplir_gene()
-        #self.virer_patient()
+        self.encoder_cohort()
        self.imputer_age_at_diagnosis()
         #self.rajout_feature_temps()
         self.encoder_patient()
-
-
+        self.imputer_valeurs_on()
+        self.rajout_feature_temps()
+        self.virer_patient_et_autre()
+        return self.X,self.y
     def enlever_index(self):
         self.X.drop('Index',axis=1,inplace=True)
 
@@ -37,7 +43,6 @@
         X_patient=vectorizer.fit_transform(X_patient)
         X_patient=pd.DataFrame(X_patient.toarray(),columns=vectorizer.get_feature_names_out())
         self.X=pd.concat([self.X,X_patient],axis=1)
-        self.X.drop('patient_id',axis=1,inplace=True)
         return self.X
 
 
@@ -76,8 +81,10 @@
         self.X['est_OTHER+']=self.X['gene'].apply(lambda x: f_o(x))
         self.X.drop('gene',axis=1,inplace=True)
         return self.X
-    def virer_patient(self):
+    def virer_patient_et_autre(self):
         self.X.drop('patient_id',axis=1,inplace=True)
+        self.X.drop(['time_since_intake_on','time_since_intake_off'],axis=1,inplace=True)
+        return self.X
     def get_X(self):
         return self.X
     def get_y(self):
@@ -97,19 +104,19 @@
 
         # rajouter la progression du score on et off depuis la dernière visite
         self.X['diff_on'] = self.X.groupby('patient_id')['on'].diff()
-        self.X['diff_off'] = self.X.groupby('patient_id')['off'].diff()
+        #self.X['diff_off'] = self.X.groupby('patient_id')['off'].diff()
 
         # rajouter la progression du score on et off depuis la première visite
         self.X['diff_on_first'] = self.X.groupby('patient_id')['on'].transform('first')
-        self.X['diff_off_first'] = self.X.groupby('patient_id')['off'].transform('first')
+        #self.X['diff_off_first'] = self.X.groupby('patient_id')['off'].transform('first')
 
         # rajouter la moyenne du score on et off sur toutes les visites
         self.X['mean_on'] = self.X.groupby('patient_id')['on'].transform('mean')
-        self.X['mean_off'] = self.X.groupby('patient_id')['off'].transform('mean')
+        #self.X['mean_off'] = self.X.groupby('patient_id')['off'].transform('mean')
 
         # rajouter l'écart type du score on et off sur toutes les visites
         self.X['std_on'] = self.X.groupby('patient_id')['on'].transform('std')
-        self.X['std_off'] = self.X.groupby('patient_id')['off'].transform('std')
+        #self.X['std_off'] = self.X.groupby('patient_id')['off'].transform('std')
 
         # rajouter le temps depuis la dernière visite
         self.X['time_since_last_visit'] = self.X.groupby('patient_id')['age'].diff()
@@ -170,4 +177,35 @@
 
         print("Variable 'disease_duration' ajoutée avec succès.")
         return self.X
-    
\ No newline at end of file
+    def imputer_valeurs_on(self):
+        """
+        Impute les valeurs manquantes de on en utilisant une régression linéaire.
+        """
+        X_copy = self.X.copy()
+        X_copy = X_copy.drop(['ledd','off','time_since_intake_on','time_since_intake_off','patient_id'],axis=1)
+
+        X_copy_a_imputer = X_copy[X_copy['on'].isna()]
+        X_copy_connu = X_copy[~X_copy['on'].isna()]
+
+        y_copy = X_copy_connu['on']
+
+        X_train, X_test, y_train, y_test = train_test_split(X_copy_connu.drop('on',axis=1), y_copy, test_size=0.2, random_state=42)
+
+        model = LinearRegression()
+        model.fit(X_train, y_train)
+
+        y_pred = model.predict(X_test)
+        mae = mean_absolute_error(y_test, y_pred)
+        print(f"Performance du modèle d'imputation linéaire - MAE: {mae:.2f}")
+        print("Imputation des valeurs manquantes de 'on'...")
+        X_copy.loc[X_copy['on'].isna(),'on'] = model.predict(X_copy_a_imputer.drop('on',axis=1))
+        self.X['on']=X_copy['on']
+        print("Les valeurs de 'on' ont été imputées avec succès.")
+        return self.X
+
+# test du code
+
+
+
+
+
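For reference, the new imputer_valeurs_on method follows a standard model-based imputation pattern: fit a LinearRegression on the rows where 'on' is known, report a hold-out MAE as a sanity check, then fill the missing rows with the model's predictions. The sketch below restates that pattern as a small standalone function on toy data; the function name, column names, and data are hypothetical illustrations, not part of preprocess4.py, and, like the patched method, it assumes the predictor columns themselves contain no missing values.

# Illustrative sketch of regression-based imputation (not the repository's code).
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

def impute_with_regression(df: pd.DataFrame, target: str) -> pd.DataFrame:
    """Fill NaNs in `target` with predictions from a linear model fit on complete rows."""
    known = df[df[target].notna()]
    missing = df[df[target].isna()]

    # Hold out part of the known rows to measure how trustworthy the imputation is.
    X_tr, X_te, y_tr, y_te = train_test_split(
        known.drop(columns=[target]), known[target], test_size=0.2, random_state=42
    )
    model = LinearRegression().fit(X_tr, y_tr)
    print(f"Imputation model hold-out MAE: {mean_absolute_error(y_te, model.predict(X_te)):.2f}")

    out = df.copy()
    out.loc[out[target].isna(), target] = model.predict(missing.drop(columns=[target]))
    return out

# Toy example: an 'on' score loosely driven by age and disease duration.
rng = np.random.default_rng(0)
toy = pd.DataFrame({"age": rng.normal(65, 8, 200), "duration": rng.normal(5, 2, 200)})
toy["on"] = 0.3 * toy["age"] + 1.5 * toy["duration"] + rng.normal(0, 1, 200)
toy.loc[rng.choice(200, 30, replace=False), "on"] = np.nan
print(impute_with_regression(toy, "on")["on"].isna().sum())  # -> 0, all gaps filled

Printing a hold-out MAE before imputing, as the patch does, is a useful habit: if the error is large relative to the spread of 'on', the imputed values should be treated with caution downstream.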