From 7f0f0a49fae66bce65fd04fa1774f9b40eaa512e Mon Sep 17 00:00:00 2001
From: atebboun <amel.tebboune@etu.ec-lyon.fr>
Date: Wed, 26 Mar 2025 18:33:42 +0100
Subject: [PATCH] imputation de l'age au diagnostic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

En gros, on fait une régression linéaire ; ça marche pas trop mal (MAE de 1.33).
---
 .../__pycache__/preprocess4.cpython-310.pyc   | Bin 2628 -> 2502 bytes
 preprocess/analysedonnees.ipynb               |  87 +++++++++---
 preprocess/preprocess4.py                     | 107 ++++++++++++---
 preprocess/valeursoff.ipynb                   | 125 +++++++++++++++---
 4 files changed, 268 insertions(+), 51 deletions(-)

diff --git a/preprocess/__pycache__/preprocess4.cpython-310.pyc b/preprocess/__pycache__/preprocess4.cpython-310.pyc
index 1f8db05ce3b312eec7248569d0e514d5e63154d7..566ddc7aa698080e4165633ef5f7689976e24a77 100644
GIT binary patch
delta 1174
zcmX>ia!gn|pO=@5fq{WR?cL+_n`{gWk3k${%)-FH;K0DZQ0y>K+ng_jZ6RYbBO^l!
zV=#jz`@}p2Mw5yCdfZ$s3{mVU+`$Z*JQFWm7KvgjNG!>SkATvZD;bKoCvRumWLXR{
zErlVNVI`wq5jO(^!%D^?ko90Tgy3OdV7SFymY9=TT2vgLpOz*JvV8I)CgaIA%xa8L
zlarYZB>5Q_7$C|)h7^JH6mc*xFo;cF!)(e2vV@g^fq@yM^TOo!%x3ihwM;dPHB6EW
zwag_fHO$S7C9K&@MVcubAW<8JLdhD&8ip)}6h<2c2rbF5fUTV|jWLBYg{y^QAw-rN
zMV29rF@*<7mKQ~qDUC6O4<@UW#gM`ek<MaIVN4N75lj*4Wo86h7R;b2>=!lp3`>Y%
z6`M~`khhU`ks!!}j35H!!Ig}+7%R~Hdy6?OK1X_TAZwVX7y|=?CYxUsi@TE}RI4}x
z14H^tHA9XyGPmrD;06^*z>I{L1`0TkLFq~$ap}n)Slzv0rm^{lc(?{3n^(A|`*sGy
zej?1v*Mym;!<JtJa?Iq5ETWU&unB4Lfn-3Q0=q?%u}BIo0Co+BO-^ENQc9wo`D6|D
zczrpLEQTH#kSN#?ke(uW1_lNTkdVUUZS3mx*!9SQ6rk#{1PLiHFfdecr52aOBY8Na
zNC*_D97UjXbc?MdKPR)e<Q8jDVqSXcE#{omyjv_Kl?AD{SkrRy6H9KfCKeRrR2GSX
z^svN$lXMhYL_Am%V`h;g$R3c@n#@IDr-S_rb`aS8oD2*MAT`Axm-8`lFmkYSunVwr
zF!C^QFflc7G0K6V5F;DoWIkrS$rm_8)InjQ$q05|5ti(5i#0v9BtF7sGB>A(8!yOy
z0T2PoBt;<WiZno6keeVW0n7qBhKYfJfgO}{Ks*ix4n_`c4rUH!9_7iiIW6U1f}E(y
zd5f(eF)t;txCoTTZn5T-<`z`mV#+I+{GL<77G(4-mW<TI6eP3Z7Tn^n$<0qG%}KQb
RCCg%%EliAjEL@gii~y8y-l6~i

delta 1265
zcmX>md_+V$pO=@5fq{Xc@!I2beRc+h#~=<eW@cbuaA06yD7KrZZO*fhp_!49A%!uR
zL6dD_fdZq=#EE*sY%L5?tSMY6+${`I>?u6K44S+XPhA$d#a575k`W&Pr7Kr56!A{p
z$hgV!GYbO)LkdGM!%9ZKB3=dthLwy(AV+}N5Q2|^f#DW+Sz=CVX;E=}ep;F^0|Ue7
z$#a;DCkJqF7#9gJFfbGef(VfQA|W^n%mwK#;$&c85Ce&eFfcGwNl%`~COz4nLvZqW
zCN;)elfN+;NMh5)!N9;EIoY1sl#i8xfdPa;0Z@Exauc&zJ%24z4Py<HBttE84RZ-g
z7Hc+Bkwyv!NX&+zP_l-xh9Qd~h0%r~ogsyD5o0Y2gfGdkfUTV|jS(C~919_;cx)Ju
zWEs*JQ+QLjTVS$$D6&jxj4Avn+`4dS0f=-KdkSNUV2V(Ra4$0>*tlQ@O%cCaEa|Cv
zsgoVqL?;)sSQ-9e^9c&_HqtHvhXx}kNI;>qlJOQ}1thdV+#*n*-C|CQ&ykyak0s1g
zhJk@Wlg;lJi@TE}RI4ll14H^tHA9XyGPmrD;06`R!;FNO1~VvK6(p`SnV(H(@=8`|
zZ<vW}{vjT&LC9tnuIav=!LXkQGxK#|X6mVcoTJHFBnOIKZohm5-_nxAlFa<PB4v;o
zu;sFo?bx)m5a|TuQccDpMUVo7r#WnL5_6MM674J}PhyMLR|m;r=urWQ!u1qsFfcG!
zfrK<CE3m7(Vb`MuQh=(*8YH9%axzzHaY;N<P=pkTPwr)xuV>0DxW(+3=y!`Xv7jKQ
zvPcRP5v<@0a*Hh@J}0xd<Q8LQkpd`(utReQIN&vzi@=Tq`wnb1C?>%902K7aART;+
z9E=?79P9$@9E@Cy9E?m2AX1JI1ceyc7?T+%J8~$igM6>aSOiM`*z(pb*7Vep_z1hn
z-5ees{2(8JvM1Q4B2We@G6so(J&qv2j$vY8U|<K8CLkUM0|z4qHwQBa@+eJ~<g}F5
z<OG!#c`1p-MWB3li#4w_x1h2}mw|y{av`ULEy&<oEE%bZDPS)n5g-F^aoFVMr<CTT
X+JO>hF%JU+1IP|0Mm`oUYY|2Ol_cy$

diff --git a/preprocess/analysedonnees.ipynb b/preprocess/analysedonnees.ipynb
index 7b5e57e..59d3a0e 100644
--- a/preprocess/analysedonnees.ipynb
+++ b/preprocess/analysedonnees.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -321,26 +321,51 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "count    35010.000000\n",
+       "mean       638.102228\n",
+       "std        219.443182\n",
+       "min         50.000000\n",
+       "25%        481.000000\n",
+       "50%        611.000000\n",
+       "75%        765.000000\n",
+       "max       1796.000000\n",
+       "Name: ledd, dtype: float64"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X['ledd'].describe()"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
-     "ename": "TypeError",
-     "evalue": "unsupported operand type(s) for /: 'list' and 'int'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[5], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mX\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mgene\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalue_counts\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtolist\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m/\u001b[39;49m\u001b[43mX\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshape\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\n",
-      "\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for /: 'list' and 'int'"
-     ]
+     "data": {
+      "text/plain": [
+       "gene\n",
+       "No Mutation    0.320324\n",
+       "LRRK2+         0.167815\n",
+       "GBA+           0.145622\n",
+       "OTHER+         0.043271\n",
+       "Name: count, dtype: float64"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
@@ -349,10 +374,40 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "df_gr=X.groupby('patient_id')['ledd'].sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_non=df_gr[df_gr==0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Nombre de patients n'ayant pas de valeurs pour ledd :  808\n",
+      "Nombre de patients au total :  6971\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Nombre de patients n'ayant pas de valeurs pour ledd : \", df_non.shape[0])\n",
+    "print(\"Nombre de patients au total : \", df_gr.shape[0])"
+   ]
   }
  ],
  "metadata": {
diff --git a/preprocess/preprocess4.py b/preprocess/preprocess4.py
index c94f954..0a9380c 100644
--- a/preprocess/preprocess4.py
+++ b/preprocess/preprocess4.py
@@ -1,29 +1,43 @@
 import pandas as pd
 import numpy as np
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.linear_model import LinearRegression
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_absolute_error, r2_score
 '''
 fichier pour centraliser toutes les transformations de données
 
 '''
 class PreprocessData:
     def __init__(self,path_X,path_y):
-        self.X = pd.read_csv(path_X)
+        self.X=pd.read_csv(path_X)
+
         self.y = pd.read_csv(path_y)
-        print("coucou")
-        self.virer_patient()
-        print("coucou 2")
+  
+
+        self.enlever_index()
         self.remplir_gene()
-        print("Coucou 3")
+        #self.virer_patient()
+        self.encoder_cohort()
+        self.imputer_age_at_diagnosis()
         
         
     
-    def valeurs_off(self):
-        pass
-
-    def remplir_gene(self):
-        pass
-
-    def virer_patient(self):
+    def enlever_index(self):
+        self.X.drop('Index',axis=1,inplace=True)
+        
+    def encoder_cohort(self):
+        self.X['cohort']=self.X['cohort'].apply(lambda x:1 if x=='A' else 0)
+        
+    def encoder_patient(self):
+        X_patient=self.X['patient_id']
+        vectorizer = CountVectorizer()
+        X_patient=vectorizer.fit_transform(X_patient)
+        X_patient=pd.DataFrame(X_patient.toarray(),columns=vectorizer.get_feature_names_out())
+        self.X=pd.concat([self.X,X_patient],axis=1)
         self.X.drop('patient_id',axis=1,inplace=True)
+        return self.X
+            
         
     def remplir_gene(self):
         X_list=self.X['gene'].tolist()
@@ -55,15 +69,78 @@ class PreprocessData:
   
                 X_list[i]='Inconnu'
         self.X['gene']=X_list
-        valeurs=['LRRK2+','No Mutation','GBA+','OTHER+','Inconnu']
         self.X['est_LRRK2+']=self.X['gene'].apply(lambda x: f_l(x))
         self.X['est_GBA+']=self.X['gene'].apply(lambda x: f_g(x))
         self.X['est_OTHER+']=self.X['gene'].apply(lambda x: f_o(x))
         self.X.drop('gene',axis=1,inplace=True)
         return self.X
+    def virer_patient(self):
+        self.X.drop('patient_id',axis=1,inplace=True)
     def get_X(self):
         return self.X
+    def get_y(self):
+        return self.y
+    def get_data(self):
+        return self.X,self.y
+    def rajout_feature_temps(self):
+        '''
+        Pour capturer la relation temporelle
+        '''
+        pass
+    def imputer_age_at_diagnosis(self):
+        """
+        Impute les valeurs manquantes de age_at_diagnosis en utilisant une régression linéaire.
+        """
 
         
-preprocess4 = PreprocessData('data/X_train_6ZIKlTY.csv', 'data/y_train_lXj6X5y.csv')
-print(preprocess4.get_X().head(30))
\ No newline at end of file
+        patients_values = self.X.dropna(subset=['age_at_diagnosis']).groupby('patient_id')['age_at_diagnosis'].first().reset_index()
+        patients_values.columns = ['patient_id', 'known_value']
+        
+        temp_df = self.X.merge(patients_values, on='patient_id', how='left')
+        
+        mask = temp_df['age_at_diagnosis'].isna() & temp_df['known_value'].notna()
+        self.X.loc[mask.values, 'age_at_diagnosis'] = temp_df.loc[mask, 'known_value'].values
+        
+        patients_sans_diagnostic = self.X.groupby('patient_id')['age_at_diagnosis'].apply(
+            lambda x: x.isna().all())
+        patients_sans_diagnostic = patients_sans_diagnostic[patients_sans_diagnostic].index.tolist()
+        
+        nb_patients_sans_diagnostic = len(patients_sans_diagnostic)
+        total_patients = len(self.X['patient_id'].unique())
+        pourcentage = (nb_patients_sans_diagnostic / total_patients) * 100
+
+        
+        if nb_patients_sans_diagnostic > 0:
+            patients_avec_diagnostic = ~self.X['patient_id'].isin(patients_sans_diagnostic)
+            df_known = self.X[patients_avec_diagnostic].dropna(subset=['age_at_diagnosis'])
+            df_known_unique = df_known.drop_duplicates('patient_id')
+            
+            features = ['age', 'sexM', 'est_LRRK2+', 'est_GBA+', 'est_OTHER+', 'cohort']
+            X_known = df_known_unique[features]
+            y_known = df_known_unique['age_at_diagnosis']
+            
+            X_train, X_test, y_train, y_test = train_test_split(
+                X_known, y_known, test_size=0.2, random_state=42)
+            
+            model = LinearRegression()
+            model.fit(X_train, y_train)
+            
+            y_pred = model.predict(X_test)
+            mae = mean_absolute_error(y_test, y_pred)
+            r2 = r2_score(y_test, y_pred)
+            print(f"Performance du modèle d'imputation linéaire - MAE: {mae:.2f}, R²: {r2:.2f}")
+            
+            df_unknown = self.X[self.X['patient_id'].isin(patients_sans_diagnostic)].drop_duplicates('patient_id')
+            X_unknown = df_unknown[features]
+            predicted_ages = model.predict(X_unknown)
+            
+            for i, patient_id in enumerate(df_unknown['patient_id']):
+                self.X.loc[self.X['patient_id'] == patient_id, 'age_at_diagnosis'] = predicted_ages[i]
+        
+        missing_pct = self.X['age_at_diagnosis'].isna().mean() * 100
+        print(f"Pourcentage de valeurs manquantes après imputation: {missing_pct:.2f}%")
+        self.X['disease_duration'] = self.X['age'] - self.X['age_at_diagnosis']
+        print("Variable 'disease_duration' ajoutée avec succès.")
+        
+        return self.X
+        
diff --git a/preprocess/valeursoff.ipynb b/preprocess/valeursoff.ipynb
index 6858993..5d983ab 100644
--- a/preprocess/valeursoff.ipynb
+++ b/preprocess/valeursoff.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 36,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -12,7 +12,8 @@
     "import seaborn as sns\n",
     "\n",
     "X = pd.read_csv('data/X_train_6ZIKlTY.csv')\n",
-    "y = pd.read_csv('data/y_train_lXj6X5y.csv')\n"
+    "y = pd.read_csv('data/y_train_lXj6X5y.csv')\n",
+    "X_chall=pd.read_csv('data/X_test_oiZ2ukx.csv')\n"
    ]
   },
   {
@@ -162,53 +163,137 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
      "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Index</th>\n",
+       "      <th>patient_id</th>\n",
+       "      <th>cohort</th>\n",
+       "      <th>sexM</th>\n",
+       "      <th>gene</th>\n",
+       "      <th>age_at_diagnosis</th>\n",
+       "      <th>age</th>\n",
+       "      <th>ledd</th>\n",
+       "      <th>time_since_intake_on</th>\n",
+       "      <th>time_since_intake_off</th>\n",
+       "      <th>on</th>\n",
+       "      <th>off</th>\n",
+       "      <th>time_since_diagnosis</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
       "text/plain": [
-       "23407"
+       "Empty DataFrame\n",
+       "Columns: [Index, patient_id, cohort, sexM, gene, age_at_diagnosis, age, ledd, time_since_intake_on, time_since_intake_off, on, off, time_since_diagnosis]\n",
+       "Index: []"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 24,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "X['off'].isna().sum()"
+    "v=X_chall[X_chall['off'].isna()]\n",
+    "v=v[v['on'].isna()]\n",
+    "v.head()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 39,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from preprocess import PreprocessData\n",
-    "\n",
-    "preprocess4 = PreprocessData('data/X_train_6ZIKlTY.csv', 'data/y_train_lXj6X5y.csv')"
+    "from preprocess4 import PreprocessData\n",
+    "\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 45,
    "metadata": {},
    "outputs": [
     {
-     "ename": "AttributeError",
-     "evalue": "'PreprocessData' object has no attribute 'get_X'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mpreprocess4\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_X\u001b[49m()\n",
-      "\u001b[0;31mAttributeError\u001b[0m: 'PreprocessData' object has no attribute 'get_X'"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "coucou\n",
+      "coucou 2\n",
+      "Coucou 3\n"
      ]
     }
    ],
-   "source": []
+   "source": [
+    "x_2=PreprocessData('data/X_train_6ZIKlTY.csv','data/X_train_6ZIKlTY.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Index                    0.000000\n",
+       "cohort                   0.000000\n",
+       "sexM                     0.000000\n",
+       "age_at_diagnosis         0.050897\n",
+       "age                      0.000000\n",
+       "ledd                     0.370358\n",
+       "time_since_intake_on     0.466522\n",
+       "time_since_intake_off    0.788231\n",
+       "on                       0.299606\n",
+       "off                      0.420966\n",
+       "est_LRRK2+               0.000000\n",
+       "est_GBA+                 0.000000\n",
+       "est_OTHER+               0.000000\n",
+       "dtype: float64"
+      ]
+     },
+     "execution_count": 46,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "x_2.get_X().isna().mean()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x_2.groupby()"
+   ]
   }
  ],
  "metadata": {
-- 
GitLab