pzfunzapf

63f93193 · Tebboune Amel · 9f09034b · 63f93193 · 63f93193 · 63f93193
Commit 63f93193 authored 3 months ago by Tebboune Amel
--- a/preprocess/__pycache__/preprocess.cpython-310.pyc
+++ b/preprocess/__pycache__/preprocess.cpython-310.pyc
--- a/preprocess/__pycache__/preprocess4.cpython-310.pyc
+++ b/preprocess/__pycache__/preprocess4.cpython-310.pyc
--- a/preprocess/analysedonnees.ipynb
+++ b/preprocess/analysedonnees.ipynb
--- a/preprocess/preprocess4.py
+++ b/preprocess/preprocess4.py
+import pandas as pd
+import numpy as np
+'''
+fichier pour centraliser toutes les transformations de données
+
+'''
+class PreprocessData:
+    def __init__(self,path_X,path_y):
+        self.X = pd.read_csv(path_X)
+        self.y = pd.read_csv(path_y)
+        print("coucou")
+        self.virer_patient()
+        print("coucou 2")
+        self.remplir_gene()
+        print("Coucou 3")
+        
+        
+    
+    def valeurs_off(self):
+        pass
+
+    def remplir_gene(self):
+        pass
+
+    def virer_patient(self):
+        self.X.drop('patient_id',axis=1,inplace=True)
+        
+    def remplir_gene(self):
+        X_list=self.X['gene'].tolist()
+        def f_l(x):
+            if x=='Inconnu':
+                return 0.4705
+            elif x=='LRRK2+':
+                return 1
+            else :
+                return 0
+        def f_g(x):
+            if x=='GBA+':
+                return 1
+            elif x=='Inconnu':
+                return 0.4080
+            else :
+                return 0
+        def f_o(x):
+            if x=='OTHER+':
+                return 1
+            elif x=='Inconnu':
+                return 0.1211
+            else :
+                return 0
+        for i in range(len(X_list)):
+            x=X_list[i]
+            
+            if type(x)==float:
+  
+                X_list[i]='Inconnu'
+        self.X['gene']=X_list
+        valeurs=['LRRK2+','No Mutation','GBA+','OTHER+','Inconnu']
+        self.X['est_LRRK2+']=self.X['gene'].apply(lambda x: f_l(x))
+        self.X['est_GBA+']=self.X['gene'].apply(lambda x: f_g(x))
+        self.X['est_OTHER+']=self.X['gene'].apply(lambda x: f_o(x))
+        self.X.drop('gene',axis=1,inplace=True)
+        return self.X
+    def get_X(self):
+        return self.X
+
+        
+# Script-style smoke run: build the preprocessing object from the training
+# CSV files (paths are relative to the current working directory) and print
+# the first 30 rows of the resulting feature table.
+# NOTE(review): these statements sit at module level, so they also execute
+# whenever this module is imported; consider an `if __name__ == "__main__":`
+# guard once no caller relies on the import-time run.
+preprocess4 = PreprocessData('data/X_train_6ZIKlTY.csv', 'data/y_train_lXj6X5y.csv')
+print(preprocess4.get_X().head(30))
\ No newline at end of file
--- a/preprocess/remplissageoff.ipynb
+++ b/preprocess/remplissageoff.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/preprocess/valeursoff.ipynb
+++ b/preprocess/valeursoff.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "\n",
+    "X = pd.read_csv('data/X_train_6ZIKlTY.csv')\n",
+    "y = pd.read_csv('data/y_train_lXj6X5y.csv')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Index</th>\n",
+       "      <th>patient_id</th>\n",
+       "      <th>cohort</th>\n",
+       "      <th>sexM</th>\n",
+       "      <th>gene</th>\n",
+       "      <th>age_at_diagnosis</th>\n",
+       "      <th>age</th>\n",
+       "      <th>ledd</th>\n",
+       "      <th>time_since_intake_on</th>\n",
+       "      <th>time_since_intake_off</th>\n",
+       "      <th>on</th>\n",
+       "      <th>off</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>IPLP5212</td>\n",
+       "      <td>A</td>\n",
+       "      <td>0</td>\n",
+       "      <td>LRRK2+</td>\n",
+       "      <td>48.5</td>\n",
+       "      <td>52.1</td>\n",
+       "      <td>607.0</td>\n",
+       "      <td>1.9</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>7.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>IPLP5212</td>\n",
+       "      <td>A</td>\n",
+       "      <td>0</td>\n",
+       "      <td>LRRK2+</td>\n",
+       "      <td>48.5</td>\n",
+       "      <td>53.0</td>\n",
+       "      <td>666.0</td>\n",
+       "      <td>1.9</td>\n",
+       "      <td>17.6</td>\n",
+       "      <td>12.0</td>\n",
+       "      <td>44.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>IPLP5212</td>\n",
+       "      <td>A</td>\n",
+       "      <td>0</td>\n",
+       "      <td>LRRK2+</td>\n",
+       "      <td>48.5</td>\n",
+       "      <td>53.9</td>\n",
+       "      <td>717.0</td>\n",
+       "      <td>1.2</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>6.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>3</td>\n",
+       "      <td>IPLP5212</td>\n",
+       "      <td>A</td>\n",
+       "      <td>0</td>\n",
+       "      <td>LRRK2+</td>\n",
+       "      <td>48.5</td>\n",
+       "      <td>54.8</td>\n",
+       "      <td>770.0</td>\n",
+       "      <td>1.5</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>11.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4</td>\n",
+       "      <td>IPLP5212</td>\n",
+       "      <td>A</td>\n",
+       "      <td>0</td>\n",
+       "      <td>LRRK2+</td>\n",
+       "      <td>48.5</td>\n",
+       "      <td>56.9</td>\n",
+       "      <td>885.0</td>\n",
+       "      <td>0.3</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>24.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Index patient_id cohort  sexM    gene  age_at_diagnosis   age   ledd  \\\n",
+       "0      0   IPLP5212      A     0  LRRK2+              48.5  52.1  607.0   \n",
+       "1      1   IPLP5212      A     0  LRRK2+              48.5  53.0  666.0   \n",
+       "2      2   IPLP5212      A     0  LRRK2+              48.5  53.9  717.0   \n",
+       "3      3   IPLP5212      A     0  LRRK2+              48.5  54.8  770.0   \n",
+       "4      4   IPLP5212      A     0  LRRK2+              48.5  56.9  885.0   \n",
+       "\n",
+       "   time_since_intake_on  time_since_intake_off    on   off  \n",
+       "0                   1.9                    NaN   7.0   NaN  \n",
+       "1                   1.9                   17.6  12.0  44.0  \n",
+       "2                   1.2                    NaN   6.0   NaN  \n",
+       "3                   1.5                    NaN  11.0   NaN  \n",
+       "4                   0.3                    NaN  24.0   NaN  "
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "23407"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X['off'].isna().sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from preprocess import PreprocessData\n",
+    "\n",
+    "preprocess4 = PreprocessData('data/X_train_6ZIKlTY.csv', 'data/y_train_lXj6X5y.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "'PreprocessData' object has no attribute 'get_X'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mpreprocess4\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_X\u001b[49m()\n",
+      "\u001b[0;31mAttributeError\u001b[0m: 'PreprocessData' object has no attribute 'get_X'"
+     ]
+    }
+   ],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
+%% Cell type:code id: tags:
+
+``` python
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+X = pd.read_csv('data/X_train_6ZIKlTY.csv')
+y = pd.read_csv('data/y_train_lXj6X5y.csv')
+```
+
+%% Cell type:code id: tags:
+
+``` python
+X.head()
+```
+
+%% Output
+
+       Index patient_id cohort  sexM    gene  age_at_diagnosis   age   ledd  \
+    0      0   IPLP5212      A     0  LRRK2+              48.5  52.1  607.0
+    1      1   IPLP5212      A     0  LRRK2+              48.5  53.0  666.0
+    2      2   IPLP5212      A     0  LRRK2+              48.5  53.9  717.0
+    3      3   IPLP5212      A     0  LRRK2+              48.5  54.8  770.0
+    4      4   IPLP5212      A     0  LRRK2+              48.5  56.9  885.0
+    
+       time_since_intake_on  time_since_intake_off    on   off
+    0                   1.9                    NaN   7.0   NaN
+    1                   1.9                   17.6  12.0  44.0
+    2                   1.2                    NaN   6.0   NaN
+    3                   1.5                    NaN  11.0   NaN
+    4                   0.3                    NaN  24.0   NaN
+
+%% Cell type:code id: tags:
+
+``` python
+X['off'].isna().sum()
+```
+
+%% Output
+
+    23407
+
+%% Cell type:code id: tags:
+
+``` python
+from preprocess import PreprocessData
+
+preprocess4 = PreprocessData('data/X_train_6ZIKlTY.csv', 'data/y_train_lXj6X5y.csv')
+```
+
+%% Cell type:code id: tags:
+
+``` python
+```
+
+%% Output
+
+    ---------------------------------------------------------------------------
+    AttributeError                            Traceback (most recent call last)
+Cell     In[9], line 1
+    ----> 1 preprocess4.get_X()
+    AttributeError: 'PreprocessData' object has no attribute 'get_X'