From 7f0f0a49fae66bce65fd04fa1774f9b40eaa512e Mon Sep 17 00:00:00 2001 From: atebboun <amel.tebboune@etu.ec-lyon.fr> Date: Wed, 26 Mar 2025 18:33:42 +0100 Subject: [PATCH] imputage de l'age au diagnostique MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit en gros on fait une régression linéaire ca marche pas trop mal 1.33 de MAE --- .../__pycache__/preprocess4.cpython-310.pyc | Bin 2628 -> 2502 bytes preprocess/analysedonnees.ipynb | 87 +++++++++--- preprocess/preprocess4.py | 107 ++++++++++++--- preprocess/valeursoff.ipynb | 125 +++++++++++++++--- 4 files changed, 268 insertions(+), 51 deletions(-) diff --git a/preprocess/__pycache__/preprocess4.cpython-310.pyc b/preprocess/__pycache__/preprocess4.cpython-310.pyc index 1f8db05ce3b312eec7248569d0e514d5e63154d7..566ddc7aa698080e4165633ef5f7689976e24a77 100644 GIT binary patch delta 1174 zcmX>ia!gn|pO=@5fq{WR?cL+_n`{gWk3k${%)-FH;K0DZQ0y>K+ng_jZ6RYbBO^l! zV=#jz`@}p2Mw5yCdfZ$s3{mVU+`$Z*JQFWm7KvgjNG!>SkATvZD;bKoCvRumWLXR{ zErlVNVI`wq5jO(^!%D^?ko90Tgy3OdV7SFymY9=TT2vgLpOz*JvV8I)CgaIA%xa8L zlarYZB>5Q_7$C|)h7^JH6mc*xFo;cF!)(e2vV@g^fq@yM^TOo!%x3ihwM;dPHB6EW zwag_fHO$S7C9K&@MVcubAW<8JLdhD&8ip)}6h<2c2rbF5fUTV|jWLBYg{y^QAw-rN zMV29rF@*<7mKQ~qDUC6O4<@UW#gM`ek<MaIVN4N75lj*4Wo86h7R;b2>=!lp3`>Y% z6`M~`khhU`ks!!}j35H!!Ig}+7%R~Hdy6?OK1X_TAZwVX7y|=?CYxUsi@TE}RI4}x z14H^tHA9XyGPmrD;06^*z>I{L1`0TkLFq~$ap}n)Slzv0rm^{lc(?{3n^(A|`*sGy zej?1v*Mym;!<JtJa?Iq5ETWU&unB4Lfn-3Q0=q?%u}BIo0Co+BO-^ENQc9wo`D6|D zczrpLEQTH#kSN#?ke(uW1_lNTkdVUUZS3mx*!9SQ6rk#{1PLiHFfdecr52aOBY8Na zNC*_D97UjXbc?MdKPR)e<Q8jDVqSXcE#{omyjv_Kl?AD{SkrRy6H9KfCKeRrR2GSX z^svN$lXMhYL_Am%V`h;g$R3c@n#@IDr-S_rb`aS8oD2*MAT`Axm-8`lFmkYSunVwr zF!C^QFflc7G0K6V5F;DoWIkrS$rm_8)InjQ$q05|5ti(5i#0v9BtF7sGB>A(8!yOy z0T2PoBt;<WiZno6keeVW0n7qBhKYfJfgO}{Ks*ix4n_`c4rUH!9_7iiIW6U1f}E(y zd5f(eF)t;txCoTTZn5T-<`z`mV#+I+{GL<77G(4-mW<TI6eP3Z7Tn^n$<0qG%}KQb RCCg%%EliAjEL@gii~y8y-l6~i delta 1265 zcmX>md_+V$pO=@5fq{Xc@!I2beRc+h#~=<eW@cbuaA06yD7KrZZO*fhp_!49A%!uR zL6dD_fdZq=#EE*sY%L5?tSMY6+${`I>?u6K44S+XPhA$d#a575k`W&Pr7Kr56!A{p z$hgV!GYbO)LkdGM!%9ZKB3=dthLwy(AV+}N5Q2|^f#DW+Sz=CVX;E=}ep;F^0|Ue7 z$#a;DCkJqF7#9gJFfbGef(VfQA|W^n%mwK#;$&c85Ce&eFfcGwNl%`~COz4nLvZqW zCN;)elfN+;NMh5)!N9;EIoY1sl#i8xfdPa;0Z@Exauc&zJ%24z4Py<HBttE84RZ-g z7Hc+Bkwyv!NX&+zP_l-xh9Qd~h0%r~ogsyD5o0Y2gfGdkfUTV|jS(C~919_;cx)Ju zWEs*JQ+QLjTVS$$D6&jxj4Avn+`4dS0f=-KdkSNUV2V(Ra4$0>*tlQ@O%cCaEa|Cv zsgoVqL?;)sSQ-9e^9c&_HqtHvhXx}kNI;>qlJOQ}1thdV+#*n*-C|CQ&ykyak0s1g zhJk@Wlg;lJi@TE}RI4ll14H^tHA9XyGPmrD;06`R!;FNO1~VvK6(p`SnV(H(@=8`| zZ<vW}{vjT&LC9tnuIav=!LXkQGxK#|X6mVcoTJHFBnOIKZohm5-_nxAlFa<PB4v;o zu;sFo?bx)m5a|TuQccDpMUVo7r#WnL5_6MM674J}PhyMLR|m;r=urWQ!u1qsFfcG! zfrK<CE3m7(Vb`MuQh=(*8YH9%axzzHaY;N<P=pkTPwr)xuV>0DxW(+3=y!`Xv7jKQ zvPcRP5v<@0a*Hh@J}0xd<Q8LQkpd`(utReQIN&vzi@=Tq`wnb1C?>%902K7aART;+ z9E=?79P9$@9E@Cy9E?m2AX1JI1ceyc7?T+%J8~$igM6>aSOiM`*z(pb*7Vep_z1hn z-5ees{2(8JvM1Q4B2We@G6so(J&qv2j$vY8U|<K8CLkUM0|z4qHwQBa@+eJ~<g}F5 z<OG!#c`1p-MWB3li#4w_x1h2}mw|y{av`ULEy&<oEE%bZDPS)n5g-F^aoFVMr<CTT X+JO>hF%JU+1IP|0Mm`oUYY|2Ol_cy$ diff --git a/preprocess/analysedonnees.ipynb b/preprocess/analysedonnees.ipynb index 7b5e57e..59d3a0e 100644 --- a/preprocess/analysedonnees.ipynb +++ b/preprocess/analysedonnees.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -321,26 +321,51 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "count 35010.000000\n", + "mean 638.102228\n", + "std 219.443182\n", + "min 50.000000\n", + "25% 481.000000\n", + "50% 611.000000\n", + "75% 765.000000\n", + "max 1796.000000\n", + "Name: ledd, dtype: float64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X['ledd'].describe()" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [ { - "ename": "TypeError", - "evalue": "unsupported operand type(s) for /: 'list' and 'int'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[5], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mX\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mgene\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalue_counts\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtolist\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m/\u001b[39;49m\u001b[43mX\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshape\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\n", - "\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for /: 'list' and 'int'" - ] + "data": { + "text/plain": [ + "gene\n", + "No Mutation 0.320324\n", + "LRRK2+ 0.167815\n", + "GBA+ 0.145622\n", + "OTHER+ 0.043271\n", + "Name: count, dtype: float64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -349,10 +374,40 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "df_gr=X.groupby('patient_id')['ledd'].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "df_non=df_gr[df_gr==0]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nombre de patients n'ayant pas de valeurs pour ledd : 808\n", + "Nombre de patients au total : 6971\n" + ] + } + ], + "source": [ + "print(\"Nombre de patients n'ayant pas de valeurs pour ledd : \", df_non.shape[0])\n", + "print(\"Nombre de patients au total : \", df_gr.shape[0])" + ] } ], "metadata": { diff --git a/preprocess/preprocess4.py b/preprocess/preprocess4.py index c94f954..0a9380c 100644 --- a/preprocess/preprocess4.py +++ b/preprocess/preprocess4.py @@ -1,29 +1,43 @@ import pandas as pd import numpy as np +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.linear_model import LinearRegression +from sklearn.model_selection import train_test_split +from sklearn.metrics import mean_absolute_error, r2_score ''' fichier pour centraliser toutes les transformations de données ''' class PreprocessData: def __init__(self,path_X,path_y): - self.X = pd.read_csv(path_X) + self.X=pd.read_csv(path_X) + self.y = pd.read_csv(path_y) - print("coucou") - self.virer_patient() - print("coucou 2") + + + self.enlever_index() self.remplir_gene() - print("Coucou 3") + #self.virer_patient() + self.encoder_cohort() + self.imputer_age_at_diagnosis() - def valeurs_off(self): - pass - - def remplir_gene(self): - pass - - def virer_patient(self): + def enlever_index(self): + self.X.drop('Index',axis=1,inplace=True) + + def encoder_cohort(self): + self.X['cohort']=self.X['cohort'].apply(lambda x:1 if x=='A' else 0) + + def encoder_patient(self): + X_patient=self.X['patient_id'] + vectorizer = CountVectorizer() + X_patient=vectorizer.fit_transform(X_patient) + X_patient=pd.DataFrame(X_patient.toarray(),columns=vectorizer.get_feature_names_out()) + self.X=pd.concat([self.X,X_patient],axis=1) self.X.drop('patient_id',axis=1,inplace=True) + return self.X + def remplir_gene(self): X_list=self.X['gene'].tolist() @@ -55,15 +69,78 @@ class PreprocessData: X_list[i]='Inconnu' self.X['gene']=X_list - valeurs=['LRRK2+','No Mutation','GBA+','OTHER+','Inconnu'] self.X['est_LRRK2+']=self.X['gene'].apply(lambda x: f_l(x)) self.X['est_GBA+']=self.X['gene'].apply(lambda x: f_g(x)) self.X['est_OTHER+']=self.X['gene'].apply(lambda x: f_o(x)) self.X.drop('gene',axis=1,inplace=True) return self.X + def virer_patient(self): + self.X.drop('patient_id',axis=1,inplace=True) def get_X(self): return self.X + def get_y(self): + return self.y + def get_data(self): + return self.X,self.y + def rajout_feature_temps(self): + ''' + Pour capturer la relation temporelle + ''' + pass + def imputer_age_at_diagnosis(self): + """ + Impute les valeurs manquantes de age_at_diagnosis en utilisant une régression linéaire. + """ -preprocess4 = PreprocessData('data/X_train_6ZIKlTY.csv', 'data/y_train_lXj6X5y.csv') -print(preprocess4.get_X().head(30)) \ No newline at end of file + patients_values = self.X.dropna(subset=['age_at_diagnosis']).groupby('patient_id')['age_at_diagnosis'].first().reset_index() + patients_values.columns = ['patient_id', 'known_value'] + + temp_df = self.X.merge(patients_values, on='patient_id', how='left') + + mask = temp_df['age_at_diagnosis'].isna() & temp_df['known_value'].notna() + self.X.loc[mask.values, 'age_at_diagnosis'] = temp_df.loc[mask, 'known_value'].values + + patients_sans_diagnostic = self.X.groupby('patient_id')['age_at_diagnosis'].apply( + lambda x: x.isna().all()) + patients_sans_diagnostic = patients_sans_diagnostic[patients_sans_diagnostic].index.tolist() + + nb_patients_sans_diagnostic = len(patients_sans_diagnostic) + total_patients = len(self.X['patient_id'].unique()) + pourcentage = (nb_patients_sans_diagnostic / total_patients) * 100 + + + if nb_patients_sans_diagnostic > 0: + patients_avec_diagnostic = ~self.X['patient_id'].isin(patients_sans_diagnostic) + df_known = self.X[patients_avec_diagnostic].dropna(subset=['age_at_diagnosis']) + df_known_unique = df_known.drop_duplicates('patient_id') + + features = ['age', 'sexM', 'est_LRRK2+', 'est_GBA+', 'est_OTHER+', 'cohort'] + X_known = df_known_unique[features] + y_known = df_known_unique['age_at_diagnosis'] + + X_train, X_test, y_train, y_test = train_test_split( + X_known, y_known, test_size=0.2, random_state=42) + + model = LinearRegression() + model.fit(X_train, y_train) + + y_pred = model.predict(X_test) + mae = mean_absolute_error(y_test, y_pred) + r2 = r2_score(y_test, y_pred) + print(f"Performance du modèle d'imputation linéaire - MAE: {mae:.2f}, R²: {r2:.2f}") + + df_unknown = self.X[self.X['patient_id'].isin(patients_sans_diagnostic)].drop_duplicates('patient_id') + X_unknown = df_unknown[features] + predicted_ages = model.predict(X_unknown) + + for i, patient_id in enumerate(df_unknown['patient_id']): + self.X.loc[self.X['patient_id'] == patient_id, 'age_at_diagnosis'] = predicted_ages[i] + + missing_pct = self.X['age_at_diagnosis'].isna().mean() * 100 + print(f"Pourcentage de valeurs manquantes après imputation: {missing_pct:.2f}%") + self.X['disease_duration'] = self.X['age'] - self.X['age_at_diagnosis'] + print("Variable 'disease_duration' ajoutée avec succès.") + + return self.X + diff --git a/preprocess/valeursoff.ipynb b/preprocess/valeursoff.ipynb index 6858993..5d983ab 100644 --- a/preprocess/valeursoff.ipynb +++ b/preprocess/valeursoff.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -12,7 +12,8 @@ "import seaborn as sns\n", "\n", "X = pd.read_csv('data/X_train_6ZIKlTY.csv')\n", - "y = pd.read_csv('data/y_train_lXj6X5y.csv')\n" + "y = pd.read_csv('data/y_train_lXj6X5y.csv')\n", + "X_chall=pd.read_csv('data/X_test_oiZ2ukx.csv')\n" ] }, { @@ -162,53 +163,137 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 24, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Index</th>\n", + " <th>patient_id</th>\n", + " <th>cohort</th>\n", + " <th>sexM</th>\n", + " <th>gene</th>\n", + " <th>age_at_diagnosis</th>\n", + " <th>age</th>\n", + " <th>ledd</th>\n", + " <th>time_since_intake_on</th>\n", + " <th>time_since_intake_off</th>\n", + " <th>on</th>\n", + " <th>off</th>\n", + " <th>time_since_diagnosis</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], "text/plain": [ - "23407" + "Empty DataFrame\n", + "Columns: [Index, patient_id, cohort, sexM, gene, age_at_diagnosis, age, ledd, time_since_intake_on, time_since_intake_off, on, off, time_since_diagnosis]\n", + "Index: []" ] }, - "execution_count": 4, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "X['off'].isna().sum()" + "v=X_chall[X_chall['off'].isna()]\n", + "v=v[v['on'].isna()]\n", + "v.head()" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ - "from preprocess import PreprocessData\n", - "\n", - "preprocess4 = PreprocessData('data/X_train_6ZIKlTY.csv', 'data/y_train_lXj6X5y.csv')" + "from preprocess4 import PreprocessData\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 45, "metadata": {}, "outputs": [ { - "ename": "AttributeError", - "evalue": "'PreprocessData' object has no attribute 'get_X'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mpreprocess4\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_X\u001b[49m()\n", - "\u001b[0;31mAttributeError\u001b[0m: 'PreprocessData' object has no attribute 'get_X'" + "name": "stdout", + "output_type": "stream", + "text": [ + "coucou\n", + "coucou 2\n", + "Coucou 3\n" ] } ], - "source": [] + "source": [ + "x_2=PreprocessData('data/X_train_6ZIKlTY.csv','data/X_train_6ZIKlTY.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index 0.000000\n", + "cohort 0.000000\n", + "sexM 0.000000\n", + "age_at_diagnosis 0.050897\n", + "age 0.000000\n", + "ledd 0.370358\n", + "time_since_intake_on 0.466522\n", + "time_since_intake_off 0.788231\n", + "on 0.299606\n", + "off 0.420966\n", + "est_LRRK2+ 0.000000\n", + "est_GBA+ 0.000000\n", + "est_OTHER+ 0.000000\n", + "dtype: float64" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x_2.get_X().isna().mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x_2.groupby()" + ] } ], "metadata": { -- GitLab