From 365b4776be9cecdd420900333de8d5b1717aac74 Mon Sep 17 00:00:00 2001
From: atebboun <75985994+encoreunpseudo@users.noreply.github.com>
Date: Sat, 29 Mar 2025 17:55:51 +0100
Subject: [PATCH] ok pour on

---
 .../__pycache__/preprocess4.cpython-310.pyc   | Bin 6232 -> 7082 bytes
 preprocess/prediction_on.ipynb                |   6 --
 preprocess/preprocess4.py                     |  58 +++++++++++++++---
 3 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/preprocess/__pycache__/preprocess4.cpython-310.pyc b/preprocess/__pycache__/preprocess4.cpython-310.pyc
index 23bcd7bad89c160503474ec19b0f7b2749e102af..b6a7cb001a2fe8ebb2b767265c01b7938ada5423 100644
GIT binary patch
delta 2732
zcmca%u*#e-pO=@5fq{XcRP{yrdijZb-xy0KvofZNq)4<dMDe9arbx9gMDeFc2Qz5O
zOwMD}vKC-qV5niNVTfm_VX9$>XG~!TW?0GSx02}=W5g}SN=@clEXAogX;Ewii6t5F
z5m35vB|{PK<Y1;eb`}N(1~x{n$(NY4WtkWl7@Qdx7>a#BhB7Q*SjfQ0P{Xi*5kya3
z!Kfi!!?1uI#HnG(;;3QB;!I--W+>udU|`VXntY!zqW+dpW^O@gNorAiSz=CVX;E=}
ze%>vSqQtEH(vtYJ)Wnj~qSW}3)ZBvNTfAkNMX5#c1&Jk@sd*(uAYT`8GcYi$WGv!g
zU|=ZXg%f-X3=Fr#3X1ZRQ;Un^ONtWniqrCoauZ83^YcVO&J|){VBlcnVB|1i;$Y+e
zVMY!{9>yxwfTC2WHkZVbL_H`Q&4@|dl9O4P?NvZ7xW$rElwVLJ0k<3M3XlVeKmn~Z
zIg5FTt_(<)5k!DwS2EsWtUz}yhfPjmZc<93o!Vq;7CTk2(X5FD1v!;i%?5cub@DtG
zD=tuUK%!vsBNktsU<OTAzgt|;K#I>y(PX^Eo}8aknwwW#1aikp#v(aT@N*P#O)h5j
zW6@+T;+cGkRazC~G>EIft`}ioV9)@C6)0@E7}*%P7{wUHHuJC*F=~L!DT4c_NEPNY
zg!Ws^Y4JJ6lb5lF`RRjfWAm$Gad&dm20Nk1fPsM_{iT{A#~PVi_Jj>ew*pyU#K6E%
zr7<~?Q+0Aahim}MST_F<57!`MgA3Pm-_BszPlUnwj;IE6q{Zih9XI&{r{rWMPGJRb
zpn&|R$yj6vQVbRV2aq}g1B3tMP|kRLQ;-BGO+)+-Rc;ItMO7XE5;B|om{Z*syB-se
z0#rSLAR#jb28Lf;sl_GnNWKjzk_81VC?dnTR2@N)Rip?KVFBldD7J|BoXp~qTa1}S
zY9K>E=4vt*f!zi63)nWWUj!K#7z`)x<%+eI0vW*SSELEj0g7jcEkz0-KDJaY&cMJB
z0+Ljk9MA2m07|&nQtU0(^wg60h_K0LxUCEwK<YsOrpXA-!MHV4Mu3z$O*ZH8P$}XB
zv9Y+}76-`el*E$6sL88%Y?%%*On%7|%gH9muz)#*V<F>YUtWDf2#;qWV+}(VLkh1A
z1BhP0n8LV_F@;Z(VFAlRut*BO4MU1RFoUMxWLZ8-Hb^i`j^WcYWKGK~DlUoQP0374
zi_g!C2Xk++<fbO(MX`fOP|3wyT#^#SR$P)2pPx7REuR7-*JM6^4Sg(*E)rv4V2A-_
zFi;FgvB@w?G0U)kpd8EQI{t4=(jdE|xU=)}%kw~mV`=IwmYmeI5>2Kex5@oNu9Bd1
z21#+Cm@NVsQv^zsos-`R1?hS*Ffc^16sK1B7MX*>%>qPNf(T^>28QReW`e*;`<Ggv
z5_EEouoUCh$?d|HjK3xy6!v5Enk*pVZwQLaTWp{LE-$gj8>H7ARI+jxr52}_#AoKE
zq*fGx^xtC5O)W}KEee`EK}13bln`&RLS(@?*JkoA5ktmXli!G_a(IB!mrN1QWJ%Ez
zMz`1_Kt*e2-YvF>_>$D(l3VPRFo8;l0Bdf3N@~t6w#xW|qSTaIlZC|$wCzDIfrL(O
zW^r+5UV3~%atV4+2!qOt$$4V6^~s>33RFjeFsL|VgH>BEz|~e3xWcMstYOSzN@qx6
zT*R2dl)~K01QMxXOkn|~qlFBT47E%(j0iqINQAkD86;oJQo>ThoW+#F0?}E+lERw8
zmcri462nqk&sxh?%U;V-!rIJG%UQ#@fUSnLhJ7IuBSQ`60`?lVg^aaaB^)&zHC)Y1
zwcI6~DI7K2DV(!e<}x)iGBT8Krf`91uv#XB6$?Sm<w7_og(Z#2h5^C{CG8Xz8<1Y^
zW=0-HaLNv5(BvtNo%~WzmnlDQvYL=#y(ViEOHOJ^$}Q&nw6rLZlFZ!H_~OjG<ka}g
zypqK1)cE|oC{Zk;X=#Y!p-3AP8EzmV6hwr9h;R@Q2_m9Ek)Y!Vs^Y=5uR=;{u>!QF
zpUf`AU$35@r>>``R|+Z*eGqaADX9t|DTVyJ5{1JnOAfD80JjqkuS_jgNGwZDRwyn_
zPCmS%SWlB1+!DCOlAK>q2`>MNf<QKd>ct{v5GxHtq=N`n5Rm~Q5<tn2r--K>luOtm
z;z63D1R!*LVm#Qc)S@UJs7P{tUS4SwTP0XsQ4Yw2To925BJx2*0f;CB5k(*ZoFl;m
zI9q}m9h*RT9aM~Ru?R5=FbOeoFex!vu`%&5axrl+3NUgosxk60axk(nRw<%2L!iz9
zl>*QvlAor)<ac7K_52{qBtZnIwNV6W78HSM?jlf$UIZ$ei@<dpsD)7E0#XMmc8WlG
zqX^_Eq)dZwEjt4P1E^tD3@ZOQKn*2M4qgss4rUHc4rLA|D3s%|C=viE)#SUyR*;yN
Xl2}}%GkJ=5p8(h-1i?4iNJ0$&fB{`V

delta 1971
zcmZ2we#3w-pO=@5fq{YH?ycwPpQR@9ePc|S%*vR`63n0}F}aRWOTdDGfuV-6h9RDz
zhN*@jo^kR5CJp`+hG2%3jDDKzw-_UCF;-4~%oHJ`$$X2YI5j6Nimf2ABqKfoN>{FA
zC=#6fm@!X~kAZ=KjggIsgPDVogNcKYXL1CS_T>M}?rNM23=FqeQi}2mRx%a|FfcF_
z3Bm~>5R03Efk9?+7RwS{F%Xv#M1Yj6WW2>#fu{HthfPjmZc<93ox)^mRy$R&;jD=T
z1v!;PGBAx0Jz%qW85kJkC(mQG;$mfBU~pz&U?`58{D{@pC5t14F@-sWHHEF0nUSG}
zA&ViHL6hC@7FR)HNoHzZNqlCCCgUyk<ouk{+`Qr<kjpd~izFBr7($A~85kHeIg2<a
zN3;2H7D<D6%tZo|*Rn~ggWQc|JwF2jgA&M6Q1EauvN3WoiZO~Y3T=MNR>Y_QGPDTp
z-691{_upbpi_g)U+|Ci^sR^=%&992Z-N{iK?4BYm1_p-omuiL_Yh-TO6E-N_6l8(U
z<Q<$klY=;=y<sM@`G<J81|geSxTgDd2E%?L%*?lenW@JDa?a$7T#}QSxr7zK0Ri%n
zCS#E{NDWv394DXv@tSPL6|b)klHdRlU|FbgUAO>Txi?72VDb?zbzkgy^gs&0hQRgs
zfP@Sf7#MzWr52aOBY6@UgB(SilP$Pa9hE@}!Li5!PM=Y15%D>h#U-~GGm8{Kav*ax
znTx>g0{aDQ8`vTq1_lP5$@92lT}43g!|GS00@4ABS%@t~QXoE-q<M?CEVC%JC?1;L
z{6WfOC!6v3YH@;?*mSU_r<TM=1Tru%d}ab?x5<LMqLUBtNE=#%w15Iqld%Yt<FVz1
zTVU;#!62o!llgc(v^ANEz;@x(!2z-`C9xzibaEB1Et3J`<ej{+>>QE|3s@F1OqS=<
z=LfNwQ@9o~f^tv_&tykFVO3rzkIx32uNE+-FfL?FfvAMZ@q<*l3qVy0Vo?c`6GD*_
zhS&|ZO~i&FMKqW}Q*3fQpCub4+$Znj(@SQ~PfLs9NXbk~i_cF>yTzK8SyWsS#S0P0
zi-&OeAl$Sx2>%vKZfas)6g!BF&(DkEfH2b1ZZQ{^q(re5m!yD%*}-&L+GJUN1xC)v
zcKjN`SYm3jKeL#8I4IwP5`q+)46_uo6r&8Y3>?cbZ=TBkjfqiZa*vQCrz5C{WX{Mh
zpL~%^bn;;#4M|W|fRrPk#8(7PYoL77GMPm<$it0+fgy^eIJLsJ$Pg5rMj*l%M96`P
zgjq8|;H3RaP>$4OisCE*Wq{($s?=LNMTvPS`ML4MC5a`erIY6gTQXKnz9H<#=r&nh
z#6KF89B#3J%IdttB6pC1pd#}YcTs9_YDs)%UP@|3kt)csthuR0>8VAaRC|jxFTW%&
z@fLGVe)280vc#Oy)Z!vgp1Q>f(O?2H$87RN5ktnP$;_gv94?@&B2y$V*+BG!Q51Vb
zd`VGaW*#WpfgBpeUI`PZj0XwaV$ID@NzI93tBfxwN==EHtR-fkZ3!|I6z5UgxtYbq
znR)5)1<57oK?W*zODDIA)iV2O@=Z39Qmq#T=>yd`MWCvo2vk59fiiKC6-WYHn%RL^
z_8<aW?10QbG8Yja><kPHAd`wgwE_nN2P2072QvpV2N#Dj2NMMH$Q6N9Y4Y7-D@e>s
YNi1e6Qk(ots!srH3W5-t+#;<80N&o7;{X5v

diff --git a/preprocess/prediction_on.ipynb b/preprocess/prediction_on.ipynb
index 99c13c0..028cda3 100644
--- a/preprocess/prediction_on.ipynb
+++ b/preprocess/prediction_on.ipynb
@@ -4046,7 +4046,6 @@
     "for metric_name, metric_value in metrics.items():\n",
     "    print(f\"{metric_name}: {metric_value:.4f}\")\n",
     "\n",
-    "# 1. Graphique des valeurs prédites vs réelles (jeu de test)\n",
     "plt.figure(figsize=(10, 6))\n",
     "plt.scatter(y_test, y_pred_test, alpha=0.5)\n",
     "plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')\n",
@@ -4056,7 +4055,6 @@
     "plt.grid(True)\n",
     "plt.show()\n",
     "\n",
-    "# 2. Histogramme des résidus\n",
     "residuals_test = y_test - y_pred_test\n",
     "plt.figure(figsize=(10, 6))\n",
     "plt.hist(residuals_test, bins=30, alpha=0.7, color='blue')\n",
@@ -4067,7 +4065,6 @@
     "plt.grid(True)\n",
     "plt.show()\n",
     "\n",
-    "# 3. Graphique des résidus vs valeurs prédites\n",
     "plt.figure(figsize=(10, 6))\n",
     "plt.scatter(y_pred_test, residuals_test, alpha=0.5)\n",
     "plt.axhline(y=0, color='r', linestyle='--')\n",
@@ -4077,14 +4074,12 @@
     "plt.grid(True)\n",
     "plt.show()\n",
     "\n",
-    "# 4. Visualisation de l'importance des features\n",
     "plt.figure(figsize=(12, 8))\n",
     "xgb.plot_importance(model, importance_type='weight', max_num_features=20)\n",
     "plt.title('Importance des features')\n",
     "plt.tight_layout()\n",
     "plt.show()\n",
     "\n",
-    "# 5. Graphique des valeurs prédites et réelles dans l'ordre des indices\n",
     "plt.figure(figsize=(12, 6))\n",
     "plt.plot(y_test.reset_index(drop=True), 'b-', label='Valeurs réelles')\n",
     "plt.plot(y_pred_test, 'r-', label='Valeurs prédites')\n",
@@ -4095,7 +4090,6 @@
     "plt.grid(True)\n",
     "plt.show()\n",
     "\n",
-    "# 6. Courbe d'apprentissage si disponible\n",
     "if hasattr(model, 'evals_result'):\n",
     "    results = model.evals_result()\n",
     "    if results:\n",
diff --git a/preprocess/preprocess4.py b/preprocess/preprocess4.py
index ddd88f6..623ab72 100644
--- a/preprocess/preprocess4.py
+++ b/preprocess/preprocess4.py
@@ -15,15 +15,21 @@ class PreprocessData:
         self.y = path_y
   
 
+        
+        
+        
+    def process_transformation(self):
         self.enlever_index()
         self.remplir_gene()
-        #self.virer_patient()
+       
         self.encoder_cohort()
         self.imputer_age_at_diagnosis()
         #self.rajout_feature_temps()
         self.encoder_patient()
-        
-        
+        self.imputer_valeurs_on()
+        self.rajout_feature_temps()
+        self.virer_patient_et_autre()
+        return self.X,self.y
     
     def enlever_index(self):
         self.X.drop('Index',axis=1,inplace=True)
@@ -37,7 +43,6 @@ class PreprocessData:
         X_patient=vectorizer.fit_transform(X_patient)
         X_patient=pd.DataFrame(X_patient.toarray(),columns=vectorizer.get_feature_names_out())
         self.X=pd.concat([self.X,X_patient],axis=1)
-        self.X.drop('patient_id',axis=1,inplace=True)
         return self.X
             
         
@@ -76,8 +81,10 @@ class PreprocessData:
         self.X['est_OTHER+']=self.X['gene'].apply(lambda x: f_o(x))
         self.X.drop('gene',axis=1,inplace=True)
         return self.X
-    def virer_patient(self):
+    def virer_patient_et_autre(self):
         self.X.drop('patient_id',axis=1,inplace=True)
+        self.X.drop(['time_since_intake_on','time_since_intake_off'],axis=1,inplace=True)
+        return self.X
     def get_X(self):
         return self.X
     def get_y(self):
@@ -97,19 +104,19 @@ class PreprocessData:
 
         # rajouter la progression du score on et off depuis la dernière visite
         self.X['diff_on'] = self.X.groupby('patient_id')['on'].diff()
-        self.X['diff_off'] = self.X.groupby('patient_id')['off'].diff()
+        #self.X['diff_off'] = self.X.groupby('patient_id')['off'].diff()
 
         # rajouter la progression du score on et off depuis la première visite
         self.X['diff_on_first'] = self.X.groupby('patient_id')['on'].transform('first')
-        self.X['diff_off_first'] = self.X.groupby('patient_id')['off'].transform('first')
+        #self.X['diff_off_first'] = self.X.groupby('patient_id')['off'].transform('first')
 
         # rajouter la moyenne du score on et off sur toutes les visites
         self.X['mean_on'] = self.X.groupby('patient_id')['on'].transform('mean')
-        self.X['mean_off'] = self.X.groupby('patient_id')['off'].transform('mean')
+        #self.X['mean_off'] = self.X.groupby('patient_id')['off'].transform('mean')
 
         # rajouter l'écart type du score on et off sur toutes les visites
         self.X['std_on'] = self.X.groupby('patient_id')['on'].transform('std')
-        self.X['std_off'] = self.X.groupby('patient_id')['off'].transform('std')
+        #self.X['std_off'] = self.X.groupby('patient_id')['off'].transform('std')
 
         # rajouter le temps depuis la dernière visite
         self.X['time_since_last_visit'] = self.X.groupby('patient_id')['age'].diff()
@@ -170,4 +177,35 @@ class PreprocessData:
         print("Variable 'disease_duration' ajoutée avec succès.")
         
         return self.X
-        
\ No newline at end of file
+    def imputer_valeurs_on(self):
+        """
+        Impute les valeurs manquantes de on en utilisant une régression linéaire.
+        """
+        X_copy = self.X.copy()
+        X_copy = X_copy.drop(['ledd','off','time_since_intake_on','time_since_intake_off','patient_id'],axis=1)
+
+        X_copy_a_imputer = X_copy[X_copy['on'].isna()]
+        X_copy_connu = X_copy[~X_copy['on'].isna()]
+
+        y_copy = X_copy_connu['on']
+
+        X_train, X_test, y_train, y_test = train_test_split(X_copy_connu.drop('on',axis=1), y_copy, test_size=0.2, random_state=42)
+
+        model = LinearRegression()
+        model.fit(X_train, y_train)
+
+        y_pred = model.predict(X_test)
+        mae = mean_absolute_error(y_test, y_pred)
+        print(f"Performance du modèle d'imputation linéaire - MAE: {mae:.2f}")
+        print("Imputation des valeurs manquantes de 'on'...")
+        X_copy.loc[X_copy['on'].isna(),'on'] = model.predict(X_copy_a_imputer.drop('on',axis=1))
+        self.X['on']=X_copy['on']
+        print("Les valeurs de 'on' ont été imputées avec succès.")
+        return self.X
+
+# test du code
+
+    
+
+
+
-- 
GitLab