From 365b4776be9cecdd420900333de8d5b1717aac74 Mon Sep 17 00:00:00 2001 From: atebboun <75985994+encoreunpseudo@users.noreply.github.com> Date: Sat, 29 Mar 2025 17:55:51 +0100 Subject: [PATCH] ok pour on --- .../__pycache__/preprocess4.cpython-310.pyc | Bin 6232 -> 7082 bytes preprocess/prediction_on.ipynb | 6 -- preprocess/preprocess4.py | 58 +++++++++++++++--- 3 files changed, 48 insertions(+), 16 deletions(-) diff --git a/preprocess/__pycache__/preprocess4.cpython-310.pyc b/preprocess/__pycache__/preprocess4.cpython-310.pyc index 23bcd7bad89c160503474ec19b0f7b2749e102af..b6a7cb001a2fe8ebb2b767265c01b7938ada5423 100644 GIT binary patch delta 2732 zcmca%u*#e-pO=@5fq{XcRP{yrdijZb-xy0KvofZNq)4<dMDe9arbx9gMDeFc2Qz5O zOwMD}vKC-qV5niNVTfm_VX9$>XG~!TW?0GSx02}=W5g}SN=@clEXAogX;Ewii6t5F z5m35vB|{PK<Y1;eb`}N(1~x{n$(NY4WtkWl7@Qdx7>a#BhB7Q*SjfQ0P{Xi*5kya3 z!Kfi!!?1uI#HnG(;;3QB;!I--W+>udU|`VXntY!zqW+dpW^O@gNorAiSz=CVX;E=} ze%>vSqQtEH(vtYJ)Wnj~qSW}3)ZBvNTfAkNMX5#c1&Jk@sd*(uAYT`8GcYi$WGv!g zU|=ZXg%f-X3=Fr#3X1ZRQ;Un^ONtWniqrCoauZ83^YcVO&J|){VBlcnVB|1i;$Y+e zVMY!{9>yxwfTC2WHkZVbL_H`Q&4@|dl9O4P?NvZ7xW$rElwVLJ0k<3M3XlVeKmn~Z zIg5FTt_(<)5k!DwS2EsWtUz}yhfPjmZc<93o!Vq;7CTk2(X5FD1v!;i%?5cub@DtG zD=tuUK%!vsBNktsU<OTAzgt|;K#I>y(PX^Eo}8aknwwW#1aikp#v(aT@N*P#O)h5j zW6@+T;+cGkRazC~G>EIft`}ioV9)@C6)0@E7}*%P7{wUHHuJC*F=~L!DT4c_NEPNY zg!Ws^Y4JJ6lb5lF`RRjfWAm$Gad&dm20Nk1fPsM_{iT{A#~PVi_Jj>ew*pyU#K6E% zr7<~?Q+0Aahim}MST_F<57!`MgA3Pm-_BszPlUnwj;IE6q{Zih9XI&{r{rWMPGJRb zpn&|R$yj6vQVbRV2aq}g1B3tMP|kRLQ;-BGO+)+-Rc;ItMO7XE5;B|om{Z*syB-se z0#rSLAR#jb28Lf;sl_GnNWKjzk_81VC?dnTR2@N)Rip?KVFBldD7J|BoXp~qTa1}S zY9K>E=4vt*f!zi63)nWWUj!K#7z`)x<%+eI0vW*SSELEj0g7jcEkz0-KDJaY&cMJB z0+Ljk9MA2m07|&nQtU0(^wg60h_K0LxUCEwK<YsOrpXA-!MHV4Mu3z$O*ZH8P$}XB zv9Y+}76-`el*E$6sL88%Y?%%*On%7|%gH9muz)#*V<F>YUtWDf2#;qWV+}(VLkh1A z1BhP0n8LV_F@;Z(VFAlRut*BO4MU1RFoUMxWLZ8-Hb^i`j^WcYWKGK~DlUoQP0374 zi_g!C2Xk++<fbO(MX`fOP|3wyT#^#SR$P)2pPx7REuR7-*JM6^4Sg(*E)rv4V2A-_ zFi;FgvB@w?G0U)kpd8EQI{t4=(jdE|xU=)}%kw~mV`=IwmYmeI5>2Kex5@oNu9Bd1 z21#+Cm@NVsQv^zsos-`R1?hS*Ffc^16sK1B7MX*>%>qPNf(T^>28QReW`e*;`<Ggv z5_EEouoUCh$?d|HjK3xy6!v5Enk*pVZwQLaTWp{LE-$gj8>H7ARI+jxr52}_#AoKE zq*fGx^xtC5O)W}KEee`EK}13bln`&RLS(@?*JkoA5ktmXli!G_a(IB!mrN1QWJ%Ez zMz`1_Kt*e2-YvF>_>$D(l3VPRFo8;l0Bdf3N@~t6w#xW|qSTaIlZC|$wCzDIfrL(O zW^r+5UV3~%atV4+2!qOt$$4V6^~s>33RFjeFsL|VgH>BEz|~e3xWcMstYOSzN@qx6 zT*R2dl)~K01QMxXOkn|~qlFBT47E%(j0iqINQAkD86;oJQo>ThoW+#F0?}E+lERw8 zmcri462nqk&sxh?%U;V-!rIJG%UQ#@fUSnLhJ7IuBSQ`60`?lVg^aaaB^)&zHC)Y1 zwcI6~DI7K2DV(!e<}x)iGBT8Krf`91uv#XB6$?Sm<w7_og(Z#2h5^C{CG8Xz8<1Y^ zW=0-HaLNv5(BvtNo%~WzmnlDQvYL=#y(ViEOHOJ^$}Q&nw6rLZlFZ!H_~OjG<ka}g zypqK1)cE|oC{Zk;X=#Y!p-3AP8EzmV6hwr9h;R@Q2_m9Ek)Y!Vs^Y=5uR=;{u>!QF zpUf`AU$35@r>>``R|+Z*eGqaADX9t|DTVyJ5{1JnOAfD80JjqkuS_jgNGwZDRwyn_ zPCmS%SWlB1+!DCOlAK>q2`>MNf<QKd>ct{v5GxHtq=N`n5Rm~Q5<tn2r--K>luOtm z;z63D1R!*LVm#Qc)S@UJs7P{tUS4SwTP0XsQ4Yw2To925BJx2*0f;CB5k(*ZoFl;m zI9q}m9h*RT9aM~Ru?R5=FbOeoFex!vu`%&5axrl+3NUgosxk60axk(nRw<%2L!iz9 zl>*QvlAor)<ac7K_52{qBtZnIwNV6W78HSM?jlf$UIZ$ei@<dpsD)7E0#XMmc8WlG zqX^_Eq)dZwEjt4P1E^tD3@ZOQKn*2M4qgss4rUHc4rLA|D3s%|C=viE)#SUyR*;yN Xl2}}%GkJ=5p8(h-1i?4iNJ0$&fB{`V delta 1971 zcmZ2we#3w-pO=@5fq{YH?ycwPpQR@9ePc|S%*vR`63n0}F}aRWOTdDGfuV-6h9RDz zhN*@jo^kR5CJp`+hG2%3jDDKzw-_UCF;-4~%oHJ`$$X2YI5j6Nimf2ABqKfoN>{FA zC=#6fm@!X~kAZ=KjggIsgPDVogNcKYXL1CS_T>M}?rNM23=FqeQi}2mRx%a|FfcF_ z3Bm~>5R03Efk9?+7RwS{F%Xv#M1Yj6WW2>#fu{HthfPjmZc<93ox)^mRy$R&;jD=T z1v!;PGBAx0Jz%qW85kJkC(mQG;$mfBU~pz&U?`58{D{@pC5t14F@-sWHHEF0nUSG} zA&ViHL6hC@7FR)HNoHzZNqlCCCgUyk<ouk{+`Qr<kjpd~izFBr7($A~85kHeIg2<a zN3;2H7D<D6%tZo|*Rn~ggWQc|JwF2jgA&M6Q1EauvN3WoiZO~Y3T=MNR>Y_QGPDTp z-691{_upbpi_g)U+|Ci^sR^=%&992Z-N{iK?4BYm1_p-omuiL_Yh-TO6E-N_6l8(U z<Q<$klY=;=y<sM@`G<J81|geSxTgDd2E%?L%*?lenW@JDa?a$7T#}QSxr7zK0Ri%n zCS#E{NDWv394DXv@tSPL6|b)klHdRlU|FbgUAO>Txi?72VDb?zbzkgy^gs&0hQRgs zfP@Sf7#MzWr52aOBY6@UgB(SilP$Pa9hE@}!Li5!PM=Y15%D>h#U-~GGm8{Kav*ax znTx>g0{aDQ8`vTq1_lP5$@92lT}43g!|GS00@4ABS%@t~QXoE-q<M?CEVC%JC?1;L z{6WfOC!6v3YH@;?*mSU_r<TM=1Tru%d}ab?x5<LMqLUBtNE=#%w15Iqld%Yt<FVz1 zTVU;#!62o!llgc(v^ANEz;@x(!2z-`C9xzibaEB1Et3J`<ej{+>>QE|3s@F1OqS=< z=LfNwQ@9o~f^tv_&tykFVO3rzkIx32uNE+-FfL?FfvAMZ@q<*l3qVy0Vo?c`6GD*_ zhS&|ZO~i&FMKqW}Q*3fQpCub4+$Znj(@SQ~PfLs9NXbk~i_cF>yTzK8SyWsS#S0P0 zi-&OeAl$Sx2>%vKZfas)6g!BF&(DkEfH2b1ZZQ{^q(re5m!yD%*}-&L+GJUN1xC)v zcKjN`SYm3jKeL#8I4IwP5`q+)46_uo6r&8Y3>?cbZ=TBkjfqiZa*vQCrz5C{WX{Mh zpL~%^bn;;#4M|W|fRrPk#8(7PYoL77GMPm<$it0+fgy^eIJLsJ$Pg5rMj*l%M96`P zgjq8|;H3RaP>$4OisCE*Wq{($s?=LNMTvPS`ML4MC5a`erIY6gTQXKnz9H<#=r&nh z#6KF89B#3J%IdttB6pC1pd#}YcTs9_YDs)%UP@|3kt)csthuR0>8VAaRC|jxFTW%& z@fLGVe)280vc#Oy)Z!vgp1Q>f(O?2H$87RN5ktnP$;_gv94?@&B2y$V*+BG!Q51Vb zd`VGaW*#WpfgBpeUI`PZj0XwaV$ID@NzI93tBfxwN==EHtR-fkZ3!|I6z5UgxtYbq znR)5)1<57oK?W*zODDIA)iV2O@=Z39Qmq#T=>yd`MWCvo2vk59fiiKC6-WYHn%RL^ z_8<aW?10QbG8Yja><kPHAd`wgwE_nN2P2072QvpV2N#Dj2NMMH$Q6N9Y4Y7-D@e>s YNi1e6Qk(ots!srH3W5-t+#;<80N&o7;{X5v diff --git a/preprocess/prediction_on.ipynb b/preprocess/prediction_on.ipynb index 99c13c0..028cda3 100644 --- a/preprocess/prediction_on.ipynb +++ b/preprocess/prediction_on.ipynb @@ -4046,7 +4046,6 @@ "for metric_name, metric_value in metrics.items():\n", " print(f\"{metric_name}: {metric_value:.4f}\")\n", "\n", - "# 1. Graphique des valeurs prédites vs réelles (jeu de test)\n", "plt.figure(figsize=(10, 6))\n", "plt.scatter(y_test, y_pred_test, alpha=0.5)\n", "plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')\n", @@ -4056,7 +4055,6 @@ "plt.grid(True)\n", "plt.show()\n", "\n", - "# 2. Histogramme des résidus\n", "residuals_test = y_test - y_pred_test\n", "plt.figure(figsize=(10, 6))\n", "plt.hist(residuals_test, bins=30, alpha=0.7, color='blue')\n", @@ -4067,7 +4065,6 @@ "plt.grid(True)\n", "plt.show()\n", "\n", - "# 3. Graphique des résidus vs valeurs prédites\n", "plt.figure(figsize=(10, 6))\n", "plt.scatter(y_pred_test, residuals_test, alpha=0.5)\n", "plt.axhline(y=0, color='r', linestyle='--')\n", @@ -4077,14 +4074,12 @@ "plt.grid(True)\n", "plt.show()\n", "\n", - "# 4. Visualisation de l'importance des features\n", "plt.figure(figsize=(12, 8))\n", "xgb.plot_importance(model, importance_type='weight', max_num_features=20)\n", "plt.title('Importance des features')\n", "plt.tight_layout()\n", "plt.show()\n", "\n", - "# 5. Graphique des valeurs prédites et réelles dans l'ordre des indices\n", "plt.figure(figsize=(12, 6))\n", "plt.plot(y_test.reset_index(drop=True), 'b-', label='Valeurs réelles')\n", "plt.plot(y_pred_test, 'r-', label='Valeurs prédites')\n", @@ -4095,7 +4090,6 @@ "plt.grid(True)\n", "plt.show()\n", "\n", - "# 6. Courbe d'apprentissage si disponible\n", "if hasattr(model, 'evals_result'):\n", " results = model.evals_result()\n", " if results:\n", diff --git a/preprocess/preprocess4.py b/preprocess/preprocess4.py index ddd88f6..623ab72 100644 --- a/preprocess/preprocess4.py +++ b/preprocess/preprocess4.py @@ -15,15 +15,21 @@ class PreprocessData: self.y = path_y + + + + def process_transformation(self): self.enlever_index() self.remplir_gene() - #self.virer_patient() + self.encoder_cohort() self.imputer_age_at_diagnosis() #self.rajout_feature_temps() self.encoder_patient() - - + self.imputer_valeurs_on() + self.rajout_feature_temps() + self.virer_patient_et_autre() + return self.X,self.y def enlever_index(self): self.X.drop('Index',axis=1,inplace=True) @@ -37,7 +43,6 @@ class PreprocessData: X_patient=vectorizer.fit_transform(X_patient) X_patient=pd.DataFrame(X_patient.toarray(),columns=vectorizer.get_feature_names_out()) self.X=pd.concat([self.X,X_patient],axis=1) - self.X.drop('patient_id',axis=1,inplace=True) return self.X @@ -76,8 +81,10 @@ class PreprocessData: self.X['est_OTHER+']=self.X['gene'].apply(lambda x: f_o(x)) self.X.drop('gene',axis=1,inplace=True) return self.X - def virer_patient(self): + def virer_patient_et_autre(self): self.X.drop('patient_id',axis=1,inplace=True) + self.X.drop(['time_since_intake_on','time_since_intake_off'],axis=1,inplace=True) + return self.X def get_X(self): return self.X def get_y(self): @@ -97,19 +104,19 @@ class PreprocessData: # rajouter la progression du score on et off depuis la dernière visite self.X['diff_on'] = self.X.groupby('patient_id')['on'].diff() - self.X['diff_off'] = self.X.groupby('patient_id')['off'].diff() + #self.X['diff_off'] = self.X.groupby('patient_id')['off'].diff() # rajouter la progression du score on et off depuis la première visite self.X['diff_on_first'] = self.X.groupby('patient_id')['on'].transform('first') - self.X['diff_off_first'] = self.X.groupby('patient_id')['off'].transform('first') + #self.X['diff_off_first'] = self.X.groupby('patient_id')['off'].transform('first') # rajouter la moyenne du score on et off sur toutes les visites self.X['mean_on'] = self.X.groupby('patient_id')['on'].transform('mean') - self.X['mean_off'] = self.X.groupby('patient_id')['off'].transform('mean') + #self.X['mean_off'] = self.X.groupby('patient_id')['off'].transform('mean') # rajouter l'écart type du score on et off sur toutes les visites self.X['std_on'] = self.X.groupby('patient_id')['on'].transform('std') - self.X['std_off'] = self.X.groupby('patient_id')['off'].transform('std') + #self.X['std_off'] = self.X.groupby('patient_id')['off'].transform('std') # rajouter le temps depuis la dernière visite self.X['time_since_last_visit'] = self.X.groupby('patient_id')['age'].diff() @@ -170,4 +177,35 @@ class PreprocessData: print("Variable 'disease_duration' ajoutée avec succès.") return self.X - \ No newline at end of file + def imputer_valeurs_on(self): + """ + Impute les valeurs manquantes de on en utilisant une régression linéaire. + """ + X_copy = self.X.copy() + X_copy = X_copy.drop(['ledd','off','time_since_intake_on','time_since_intake_off','patient_id'],axis=1) + + X_copy_a_imputer = X_copy[X_copy['on'].isna()] + X_copy_connu = X_copy[~X_copy['on'].isna()] + + y_copy = X_copy_connu['on'] + + X_train, X_test, y_train, y_test = train_test_split(X_copy_connu.drop('on',axis=1), y_copy, test_size=0.2, random_state=42) + + model = LinearRegression() + model.fit(X_train, y_train) + + y_pred = model.predict(X_test) + mae = mean_absolute_error(y_test, y_pred) + print(f"Performance du modèle d'imputation linéaire - MAE: {mae:.2f}") + print("Imputation des valeurs manquantes de 'on'...") + X_copy.loc[X_copy['on'].isna(),'on'] = model.predict(X_copy_a_imputer.drop('on',axis=1)) + self.X['on']=X_copy['on'] + print("Les valeurs de 'on' ont été imputées avec succès.") + return self.X + +# test du code + + + + + -- GitLab