diff --git a/preprocess/__pycache__/preprocess4.cpython-310.pyc b/preprocess/__pycache__/preprocess4.cpython-310.pyc index 1f8db05ce3b312eec7248569d0e514d5e63154d7..566ddc7aa698080e4165633ef5f7689976e24a77 100644 Binary files a/preprocess/__pycache__/preprocess4.cpython-310.pyc and b/preprocess/__pycache__/preprocess4.cpython-310.pyc differ diff --git a/preprocess/analysedonnees.ipynb b/preprocess/analysedonnees.ipynb index 7b5e57eec13abdb23eaa7fae51127d633080d97d..59d3a0e471ea66a8cd20cc7e6952e3985ad06b86 100644 --- a/preprocess/analysedonnees.ipynb +++ b/preprocess/analysedonnees.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -321,26 +321,51 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "count 35010.000000\n", + "mean 638.102228\n", + "std 219.443182\n", + "min 50.000000\n", + "25% 481.000000\n", + "50% 611.000000\n", + "75% 765.000000\n", + "max 1796.000000\n", + "Name: ledd, dtype: float64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X['ledd'].describe()" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [ { - "ename": "TypeError", - "evalue": "unsupported operand type(s) for /: 'list' and 'int'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[5], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mX\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mgene\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalue_counts\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtolist\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m/\u001b[39;49m\u001b[43mX\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshape\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\n", - "\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for /: 'list' and 'int'" - ] + "data": { + "text/plain": [ + "gene\n", + "No Mutation 0.320324\n", + "LRRK2+ 0.167815\n", + "GBA+ 0.145622\n", + "OTHER+ 0.043271\n", + "Name: count, dtype: float64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -349,10 +374,40 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "df_gr=X.groupby('patient_id')['ledd'].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "df_non=df_gr[df_gr==0]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nombre de patients n'ayant pas de valeurs pour ledd : 808\n", + "Nombre de patients au total : 6971\n" + ] + } + ], + "source": [ + "print(\"Nombre de patients n'ayant pas de valeurs pour ledd : \", df_non.shape[0])\n", + "print(\"Nombre de patients au total : \", df_gr.shape[0])" + ] } ], "metadata": { diff --git a/preprocess/preprocess4.py b/preprocess/preprocess4.py index c94f954edc44301ffcc20c4338eded1eeb25c3d0..0a9380c902c885caa4e15a80495d0aa021deae18 100644 --- a/preprocess/preprocess4.py +++ b/preprocess/preprocess4.py @@ -1,29 +1,43 @@ import pandas as pd import numpy as np +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.linear_model import LinearRegression +from sklearn.model_selection import train_test_split +from sklearn.metrics import mean_absolute_error, r2_score ''' fichier pour centraliser toutes les transformations de données ''' class PreprocessData: def __init__(self,path_X,path_y): - self.X = pd.read_csv(path_X) + self.X=pd.read_csv(path_X) + self.y = pd.read_csv(path_y) - print("coucou") - self.virer_patient() - print("coucou 2") + + + self.enlever_index() self.remplir_gene() - print("Coucou 3") + #self.virer_patient() + self.encoder_cohort() + self.imputer_age_at_diagnosis() - def valeurs_off(self): - pass - - def remplir_gene(self): - pass - - def virer_patient(self): + def enlever_index(self): + self.X.drop('Index',axis=1,inplace=True) + + def encoder_cohort(self): + self.X['cohort']=self.X['cohort'].apply(lambda x:1 if x=='A' else 0) + + def encoder_patient(self): + X_patient=self.X['patient_id'] + vectorizer = CountVectorizer() + X_patient=vectorizer.fit_transform(X_patient) + X_patient=pd.DataFrame(X_patient.toarray(),columns=vectorizer.get_feature_names_out()) + self.X=pd.concat([self.X,X_patient],axis=1) self.X.drop('patient_id',axis=1,inplace=True) + return self.X + def remplir_gene(self): X_list=self.X['gene'].tolist() @@ -55,15 +69,78 @@ class PreprocessData: X_list[i]='Inconnu' self.X['gene']=X_list - valeurs=['LRRK2+','No Mutation','GBA+','OTHER+','Inconnu'] self.X['est_LRRK2+']=self.X['gene'].apply(lambda x: f_l(x)) self.X['est_GBA+']=self.X['gene'].apply(lambda x: f_g(x)) self.X['est_OTHER+']=self.X['gene'].apply(lambda x: f_o(x)) self.X.drop('gene',axis=1,inplace=True) return self.X + def virer_patient(self): + self.X.drop('patient_id',axis=1,inplace=True) def get_X(self): return self.X + def get_y(self): + return self.y + def get_data(self): + return self.X,self.y + def rajout_feature_temps(self): + ''' + Pour capturer la relation temporelle + ''' + pass + def imputer_age_at_diagnosis(self): + """ + Impute les valeurs manquantes de age_at_diagnosis en utilisant une régression linéaire. + """ -preprocess4 = PreprocessData('data/X_train_6ZIKlTY.csv', 'data/y_train_lXj6X5y.csv') -print(preprocess4.get_X().head(30)) \ No newline at end of file + patients_values = self.X.dropna(subset=['age_at_diagnosis']).groupby('patient_id')['age_at_diagnosis'].first().reset_index() + patients_values.columns = ['patient_id', 'known_value'] + + temp_df = self.X.merge(patients_values, on='patient_id', how='left') + + mask = temp_df['age_at_diagnosis'].isna() & temp_df['known_value'].notna() + self.X.loc[mask.values, 'age_at_diagnosis'] = temp_df.loc[mask, 'known_value'].values + + patients_sans_diagnostic = self.X.groupby('patient_id')['age_at_diagnosis'].apply( + lambda x: x.isna().all()) + patients_sans_diagnostic = patients_sans_diagnostic[patients_sans_diagnostic].index.tolist() + + nb_patients_sans_diagnostic = len(patients_sans_diagnostic) + total_patients = len(self.X['patient_id'].unique()) + pourcentage = (nb_patients_sans_diagnostic / total_patients) * 100 + + + if nb_patients_sans_diagnostic > 0: + patients_avec_diagnostic = ~self.X['patient_id'].isin(patients_sans_diagnostic) + df_known = self.X[patients_avec_diagnostic].dropna(subset=['age_at_diagnosis']) + df_known_unique = df_known.drop_duplicates('patient_id') + + features = ['age', 'sexM', 'est_LRRK2+', 'est_GBA+', 'est_OTHER+', 'cohort'] + X_known = df_known_unique[features] + y_known = df_known_unique['age_at_diagnosis'] + + X_train, X_test, y_train, y_test = train_test_split( + X_known, y_known, test_size=0.2, random_state=42) + + model = LinearRegression() + model.fit(X_train, y_train) + + y_pred = model.predict(X_test) + mae = mean_absolute_error(y_test, y_pred) + r2 = r2_score(y_test, y_pred) + print(f"Performance du modèle d'imputation linéaire - MAE: {mae:.2f}, R²: {r2:.2f}") + + df_unknown = self.X[self.X['patient_id'].isin(patients_sans_diagnostic)].drop_duplicates('patient_id') + X_unknown = df_unknown[features] + predicted_ages = model.predict(X_unknown) + + for i, patient_id in enumerate(df_unknown['patient_id']): + self.X.loc[self.X['patient_id'] == patient_id, 'age_at_diagnosis'] = predicted_ages[i] + + missing_pct = self.X['age_at_diagnosis'].isna().mean() * 100 + print(f"Pourcentage de valeurs manquantes après imputation: {missing_pct:.2f}%") + self.X['disease_duration'] = self.X['age'] - self.X['age_at_diagnosis'] + print("Variable 'disease_duration' ajoutée avec succès.") + + return self.X + diff --git a/preprocess/valeursoff.ipynb b/preprocess/valeursoff.ipynb index 6858993cbb29384665b208ac10e631582675f19b..5d983abf11fb938b16ce918488eec00a01ed429b 100644 --- a/preprocess/valeursoff.ipynb +++ b/preprocess/valeursoff.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -12,7 +12,8 @@ "import seaborn as sns\n", "\n", "X = pd.read_csv('data/X_train_6ZIKlTY.csv')\n", - "y = pd.read_csv('data/y_train_lXj6X5y.csv')\n" + "y = pd.read_csv('data/y_train_lXj6X5y.csv')\n", + "X_chall=pd.read_csv('data/X_test_oiZ2ukx.csv')\n" ] }, { @@ -162,53 +163,137 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 24, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Index</th>\n", + " <th>patient_id</th>\n", + " <th>cohort</th>\n", + " <th>sexM</th>\n", + " <th>gene</th>\n", + " <th>age_at_diagnosis</th>\n", + " <th>age</th>\n", + " <th>ledd</th>\n", + " <th>time_since_intake_on</th>\n", + " <th>time_since_intake_off</th>\n", + " <th>on</th>\n", + " <th>off</th>\n", + " <th>time_since_diagnosis</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], "text/plain": [ - "23407" + "Empty DataFrame\n", + "Columns: [Index, patient_id, cohort, sexM, gene, age_at_diagnosis, age, ledd, time_since_intake_on, time_since_intake_off, on, off, time_since_diagnosis]\n", + "Index: []" ] }, - "execution_count": 4, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "X['off'].isna().sum()" + "v=X_chall[X_chall['off'].isna()]\n", + "v=v[v['on'].isna()]\n", + "v.head()" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ - "from preprocess import PreprocessData\n", - "\n", - "preprocess4 = PreprocessData('data/X_train_6ZIKlTY.csv', 'data/y_train_lXj6X5y.csv')" + "from preprocess4 import PreprocessData\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 45, "metadata": {}, "outputs": [ { - "ename": "AttributeError", - "evalue": "'PreprocessData' object has no attribute 'get_X'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mpreprocess4\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_X\u001b[49m()\n", - "\u001b[0;31mAttributeError\u001b[0m: 'PreprocessData' object has no attribute 'get_X'" + "name": "stdout", + "output_type": "stream", + "text": [ + "coucou\n", + "coucou 2\n", + "Coucou 3\n" ] } ], - "source": [] + "source": [ + "x_2=PreprocessData('data/X_train_6ZIKlTY.csv','data/X_train_6ZIKlTY.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index 0.000000\n", + "cohort 0.000000\n", + "sexM 0.000000\n", + "age_at_diagnosis 0.050897\n", + "age 0.000000\n", + "ledd 0.370358\n", + "time_since_intake_on 0.466522\n", + "time_since_intake_off 0.788231\n", + "on 0.299606\n", + "off 0.420966\n", + "est_LRRK2+ 0.000000\n", + "est_GBA+ 0.000000\n", + "est_OTHER+ 0.000000\n", + "dtype: float64" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x_2.get_X().isna().mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x_2.groupby()" + ] } ], "metadata": {