diff --git a/preprocess/__pycache__/preprocess4.cpython-310.pyc b/preprocess/__pycache__/preprocess4.cpython-310.pyc
index 1f8db05ce3b312eec7248569d0e514d5e63154d7..566ddc7aa698080e4165633ef5f7689976e24a77 100644
Binary files a/preprocess/__pycache__/preprocess4.cpython-310.pyc and b/preprocess/__pycache__/preprocess4.cpython-310.pyc differ
diff --git a/preprocess/analysedonnees.ipynb b/preprocess/analysedonnees.ipynb
index 7b5e57eec13abdb23eaa7fae51127d633080d97d..59d3a0e471ea66a8cd20cc7e6952e3985ad06b86 100644
--- a/preprocess/analysedonnees.ipynb
+++ b/preprocess/analysedonnees.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -321,26 +321,51 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "count    35010.000000\n",
+       "mean       638.102228\n",
+       "std        219.443182\n",
+       "min         50.000000\n",
+       "25%        481.000000\n",
+       "50%        611.000000\n",
+       "75%        765.000000\n",
+       "max       1796.000000\n",
+       "Name: ledd, dtype: float64"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X['ledd'].describe()"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
-     "ename": "TypeError",
-     "evalue": "unsupported operand type(s) for /: 'list' and 'int'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[5], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mX\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mgene\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalue_counts\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtolist\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m/\u001b[39;49m\u001b[43mX\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshape\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\n",
-      "\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for /: 'list' and 'int'"
-     ]
+     "data": {
+      "text/plain": [
+       "gene\n",
+       "No Mutation    0.320324\n",
+       "LRRK2+         0.167815\n",
+       "GBA+           0.145622\n",
+       "OTHER+         0.043271\n",
+       "Name: count, dtype: float64"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
@@ -349,10 +374,40 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "df_gr=X.groupby('patient_id')['ledd'].sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_non=df_gr[df_gr==0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Nombre de patients n'ayant pas de valeurs pour ledd :  808\n",
+      "Nombre de patients au total :  6971\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Nombre de patients n'ayant pas de valeurs pour ledd : \", df_non.shape[0])\n",
+    "print(\"Nombre de patients au total : \", df_gr.shape[0])"
+   ]
   }
  ],
  "metadata": {
diff --git a/preprocess/preprocess4.py b/preprocess/preprocess4.py
index c94f954edc44301ffcc20c4338eded1eeb25c3d0..0a9380c902c885caa4e15a80495d0aa021deae18 100644
--- a/preprocess/preprocess4.py
+++ b/preprocess/preprocess4.py
@@ -1,29 +1,43 @@
 import pandas as pd
 import numpy as np
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.linear_model import LinearRegression
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_absolute_error, r2_score
 '''
 fichier pour centraliser toutes les transformations de données
 
 '''
 class PreprocessData:
     def __init__(self,path_X,path_y):
-        self.X = pd.read_csv(path_X)
+        self.X=pd.read_csv(path_X)  # feature table, read from the CSV at path_X
+
         self.y = pd.read_csv(path_y)
-        print("coucou")
-        self.virer_patient()
-        print("coucou 2")
+        # Preprocessing steps, applied to self.X in this order.
+
+        self.enlever_index()
         self.remplir_gene()
-        print("Coucou 3")
+        # virer_patient() must not run before imputer_age_at_diagnosis(), which groups on 'patient_id'.
+        self.encoder_cohort()
+        self.imputer_age_at_diagnosis()  # reads the est_* and cohort columns prepared by the calls above
         
         
     
-    def valeurs_off(self):
-        pass
-
-    def remplir_gene(self):
-        pass
-
-    def virer_patient(self):
+    def enlever_index(self):
+        """Drop the 'Index' column from self.X in place."""
+        self.X.drop('Index',axis=1,inplace=True)
+    def encoder_cohort(self):
+        """Recode self.X['cohort'] to 1 where it equals 'A' and 0 for every other value."""
+        self.X['cohort']=self.X['cohort'].apply(lambda x:1 if x=='A' else 0)
+    def encoder_patient(self):
+        """Replace 'patient_id' with CountVectorizer token-count columns and return self.X."""
+        X_patient=self.X['patient_id']
+        vectorizer = CountVectorizer()
+        X_patient=vectorizer.fit_transform(X_patient)
+        X_patient=pd.DataFrame(X_patient.toarray(),columns=vectorizer.get_feature_names_out())
+        self.X=pd.concat([self.X,X_patient],axis=1)  # NOTE(review): concat aligns on index - assumes self.X has a default RangeIndex
         self.X.drop('patient_id',axis=1,inplace=True)
+        return self.X
         
     def remplir_gene(self):
         X_list=self.X['gene'].tolist()
@@ -55,15 +69,78 @@ class PreprocessData:
   
                 X_list[i]='Inconnu'
         self.X['gene']=X_list
-        valeurs=['LRRK2+','No Mutation','GBA+','OTHER+','Inconnu']
         self.X['est_LRRK2+']=self.X['gene'].apply(lambda x: f_l(x))
         self.X['est_GBA+']=self.X['gene'].apply(lambda x: f_g(x))
         self.X['est_OTHER+']=self.X['gene'].apply(lambda x: f_o(x))
         self.X.drop('gene',axis=1,inplace=True)
         return self.X
+    def virer_patient(self):  # drops the 'patient_id' column from self.X in place
+        self.X.drop('patient_id',axis=1,inplace=True)
     def get_X(self):
         return self.X
+    def get_y(self):  # returns self.y, the frame read from path_y in __init__
+        return self.y
+    def get_data(self):  # returns the (self.X, self.y) pair
+        return self.X,self.y
+    def rajout_feature_temps(self):
+        '''
+        Placeholder for a feature capturing the temporal relationship; not implemented yet.
+        '''
+        pass
+    def imputer_age_at_diagnosis(self):
+        """
+        Impute missing age_at_diagnosis values (a patient's own known value first, then a linear regression) and add disease_duration.
+        """
 
         
-preprocess4 = PreprocessData('data/X_train_6ZIKlTY.csv', 'data/y_train_lXj6X5y.csv')
-print(preprocess4.get_X().head(30))
\ No newline at end of file
+        patients_values = self.X.dropna(subset=['age_at_diagnosis']).groupby('patient_id')['age_at_diagnosis'].first().reset_index()
+        patients_values.columns = ['patient_id', 'known_value']  # one row per patient: first non-missing value
+        # Left merge: every row gets its patient's known value (NaN when the patient has none).
+        temp_df = self.X.merge(patients_values, on='patient_id', how='left')
+        # Step 1: fill rows that lack the value but whose patient has it on another row.
+        mask = temp_df['age_at_diagnosis'].isna() & temp_df['known_value'].notna()
+        self.X.loc[mask.values, 'age_at_diagnosis'] = temp_df.loc[mask, 'known_value'].values  # NOTE(review): positional mask - assumes the merge keeps self.X's row order
+        # Patients whose age_at_diagnosis is still missing on all of their rows.
+        patients_sans_diagnostic = self.X.groupby('patient_id')['age_at_diagnosis'].apply(
+            lambda x: x.isna().all())
+        patients_sans_diagnostic = patients_sans_diagnostic[patients_sans_diagnostic].index.tolist()
+        
+        nb_patients_sans_diagnostic = len(patients_sans_diagnostic)
+        total_patients = len(self.X['patient_id'].unique())
+        pourcentage = (nb_patients_sans_diagnostic / total_patients) * 100  # NOTE(review): computed but not used anywhere below
+
+        # Step 2: regression-based imputation for those patients.
+        if nb_patients_sans_diagnostic > 0:
+            patients_avec_diagnostic = ~self.X['patient_id'].isin(patients_sans_diagnostic)
+            df_known = self.X[patients_avec_diagnostic].dropna(subset=['age_at_diagnosis'])
+            df_known_unique = df_known.drop_duplicates('patient_id')  # keeps the first row of each patient
+            
+            features = ['age', 'sexM', 'est_LRRK2+', 'est_GBA+', 'est_OTHER+', 'cohort']
+            X_known = df_known_unique[features]
+            y_known = df_known_unique['age_at_diagnosis']
+            # 20% is held out only to report MAE/R2; the predictions below come from the model fit on the other 80%.
+            X_train, X_test, y_train, y_test = train_test_split(
+                X_known, y_known, test_size=0.2, random_state=42)
+            
+            model = LinearRegression()
+            model.fit(X_train, y_train)
+            
+            y_pred = model.predict(X_test)
+            mae = mean_absolute_error(y_test, y_pred)
+            r2 = r2_score(y_test, y_pred)
+            print(f"Performance du modèle d'imputation linéaire - MAE: {mae:.2f}, R²: {r2:.2f}")
+            # One prediction per remaining patient, written to all of that patient's rows.
+            df_unknown = self.X[self.X['patient_id'].isin(patients_sans_diagnostic)].drop_duplicates('patient_id')
+            X_unknown = df_unknown[features]
+            predicted_ages = model.predict(X_unknown)  # NOTE(review): assumes these feature columns contain no NaN - confirm
+            # NOTE(review): the loop below scans the whole patient_id column once per patient.
+            for i, patient_id in enumerate(df_unknown['patient_id']):
+                self.X.loc[self.X['patient_id'] == patient_id, 'age_at_diagnosis'] = predicted_ages[i]
+        
+        missing_pct = self.X['age_at_diagnosis'].isna().mean() * 100
+        print(f"Pourcentage de valeurs manquantes après imputation: {missing_pct:.2f}%")
+        self.X['disease_duration'] = self.X['age'] - self.X['age_at_diagnosis']  # age minus the (possibly imputed) age at diagnosis
+        print("Variable 'disease_duration' ajoutée avec succès.")
+        
+        return self.X
+        
diff --git a/preprocess/valeursoff.ipynb b/preprocess/valeursoff.ipynb
index 6858993cbb29384665b208ac10e631582675f19b..5d983abf11fb938b16ce918488eec00a01ed429b 100644
--- a/preprocess/valeursoff.ipynb
+++ b/preprocess/valeursoff.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 36,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -12,7 +12,8 @@
     "import seaborn as sns\n",
     "\n",
     "X = pd.read_csv('data/X_train_6ZIKlTY.csv')\n",
-    "y = pd.read_csv('data/y_train_lXj6X5y.csv')\n"
+    "y = pd.read_csv('data/y_train_lXj6X5y.csv')\n",
+    "X_chall=pd.read_csv('data/X_test_oiZ2ukx.csv')\n"
    ]
   },
   {
@@ -162,53 +163,137 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
      "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Index</th>\n",
+       "      <th>patient_id</th>\n",
+       "      <th>cohort</th>\n",
+       "      <th>sexM</th>\n",
+       "      <th>gene</th>\n",
+       "      <th>age_at_diagnosis</th>\n",
+       "      <th>age</th>\n",
+       "      <th>ledd</th>\n",
+       "      <th>time_since_intake_on</th>\n",
+       "      <th>time_since_intake_off</th>\n",
+       "      <th>on</th>\n",
+       "      <th>off</th>\n",
+       "      <th>time_since_diagnosis</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
       "text/plain": [
-       "23407"
+       "Empty DataFrame\n",
+       "Columns: [Index, patient_id, cohort, sexM, gene, age_at_diagnosis, age, ledd, time_since_intake_on, time_since_intake_off, on, off, time_since_diagnosis]\n",
+       "Index: []"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 24,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "X['off'].isna().sum()"
+    "v=X_chall[X_chall['off'].isna()]\n",
+    "v=v[v['on'].isna()]\n",
+    "v.head()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 39,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from preprocess import PreprocessData\n",
-    "\n",
-    "preprocess4 = PreprocessData('data/X_train_6ZIKlTY.csv', 'data/y_train_lXj6X5y.csv')"
+    "from preprocess4 import PreprocessData\n",
+    "\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 45,
    "metadata": {},
    "outputs": [
     {
-     "ename": "AttributeError",
-     "evalue": "'PreprocessData' object has no attribute 'get_X'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mpreprocess4\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_X\u001b[49m()\n",
-      "\u001b[0;31mAttributeError\u001b[0m: 'PreprocessData' object has no attribute 'get_X'"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "coucou\n",
+      "coucou 2\n",
+      "Coucou 3\n"
      ]
     }
    ],
-   "source": []
+   "source": [
+    "x_2=PreprocessData('data/X_train_6ZIKlTY.csv','data/y_train_lXj6X5y.csv')  # (path_X, path_y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Index                    0.000000\n",
+       "cohort                   0.000000\n",
+       "sexM                     0.000000\n",
+       "age_at_diagnosis         0.050897\n",
+       "age                      0.000000\n",
+       "ledd                     0.370358\n",
+       "time_since_intake_on     0.466522\n",
+       "time_since_intake_off    0.788231\n",
+       "on                       0.299606\n",
+       "off                      0.420966\n",
+       "est_LRRK2+               0.000000\n",
+       "est_GBA+                 0.000000\n",
+       "est_OTHER+               0.000000\n",
+       "dtype: float64"
+      ]
+     },
+     "execution_count": 46,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "x_2.get_X().isna().mean()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x_2.groupby()"
+   ]
   }
  ],
  "metadata": {