sib-swiss
diff --git a/01_data_manipulation_and_representation.ipynb b/01_data_manipulation_and_representation.ipynb
Lines changed: 104 additions & 33 deletions
@@ -154,9 +154,7 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "help(pd.read_table)"
-   ]
+   "source": []
   },
   {
    "cell_type": "markdown",
@@ -268,7 +266,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = pd.read_table(\"data/titanic_no_header.csv\", sep=\",\", header=None) \n",
+    "df = pd.read_table(\"data/titanic_no_header.csv\", \n",
+    "                   sep=\",\", header=None) \n",
     "df.head(n=3) "
    ]
   },
@@ -288,7 +287,9 @@
    "outputs": [],
    "source": [
     "df = pd.read_table(\"data/titanic_no_header.csv\", sep=\",\", \n",
-    "                   names = [\"name\", \"column2\", \"age\", \"column4\", \"blip\", \"bloop\", \"spam\", \"eggs\"]) \n",
+    "                   names = [\"name\", \"column2\", \n",
+    "                            \"age\", \"column4\", \n",
+    "                            \"blip\", \"bloop\", \"spam\", \"eggs\"]) \n",
     "# As you can see, we can choose our own names, whether they make sense or not.\n",
     "\n",
     "df.head(3) "
@@ -402,6 +403,15 @@
     "df.head(3)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.columns"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -520,6 +530,15 @@
     "* **`df.shape`** returns a tuple with the numbers of rows and columns: `(row_count, col_count)`."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.shape"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -594,6 +613,15 @@
     "df.head(3)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.columns"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -722,7 +750,9 @@
    "outputs": [],
    "source": [
     "# for instance, I may want to change the labels a bit here:\n",
-    "df.Pclass= df.Pclass.cat.rename_categories( {1:'I',2:\"II\",3:\"III\"})\n",
+    "df.Pclass= df.Pclass.cat.rename_categories( {1:'I',\n",
+    "                                             2:\"II\",\n",
+    "                                             3:\"III\"})\n",
     "df.Pclass"
    ]
   },
@@ -733,16 +763,6 @@
     "[datetime64](https://pandas.pydata.org/docs/user_guide/timeseries.html) and [category](https://towardsdatascience.com/staying-sane-while-adopting-pandas-categorical-datatypes-78dbd19dcd8a) are fairly specific, and we refer you to the provided links if you want to learn more about them.\n"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<br>\n",
-    "\n",
-    "### Micro-Exercise:\n",
-    "* Do you see more things which could be changed in the titanic data-set here?\n"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -823,7 +843,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df.loc[ 0:3, : ]    # this selects the first 4 rows."
+    "df.loc[ 0:3 , : ]    # this selects the first 4 rows."
    ]
   },
   {
@@ -832,7 +852,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df.iloc[ 0:3, : ]   # this selects the first 3 rows."
+    "df.iloc[ 0:3 , : ]   # this selects the first 3 rows."
    ]
   },
   {
@@ -1144,7 +1164,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df_census = pd.read_table(\"data/swiss_census_1880.csv\", sep=\",\")\n",
+    "df_census = pd.read_table(\"data/swiss_census_1880.csv\", \n",
+    "                          sep=\",\")\n",
     "df_census.loc[:5, [\"town name\", \"Total\", \"Male\"]]"
    ]
   },
@@ -1212,7 +1233,7 @@
    "outputs": [],
    "source": [
     "# NA is represented using pd.NA\n",
-    "df.loc[ df.Pclass==3 , 'Fare'] = pd.NA\n",
+    "df.loc[ df.Pclass==\"III\" , 'Fare'] = pd.NA\n",
     "df.head()"
    ]
   },
@@ -1513,7 +1534,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = df.drop(columns='discount')  # use the 'index' argument to remove rows instead\n",
+    "df.drop(columns='discount' , inplace=True)  # use the 'index' argument to remove rows instead\n",
     "print(\"is 'discount' part of the columns : \" , 'discount' in df.columns)"
    ]
   },
@@ -1546,6 +1567,15 @@
     "## inplace = True : df is changed, df2 is None"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "help( df.drop )"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1606,7 +1636,11 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "m_survived = df.Survived == 1\n",
+    "\n",
+    "df.loc[m_survived , 'Sex'].value_counts()\n"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -1632,7 +1666,7 @@
    "outputs": [],
    "source": [
     "# %load -r 4- solutions/solution_01_01.py\n",
-    "#2. Create a new column Title is the DataFrame representing the title by which passengers should be addressed. "
+    "#2. Create a new column Title in the DataFrame representing the title by which passengers should be addressed. The title can be found in the passenger name and is the only word ending with a '.'"
    ]
   },
   {
@@ -1733,7 +1767,8 @@
     "mask_male = df.Sex == 'male'\n",
     "\n",
     "print('median fare of male', df.Fare[mask_male].median() )\n",
-    "print('median fare of female', df.Fare[~mask_male].median() ) # note the use of ~ to reverse the mask!"
+    "print('median fare of female', df.Fare[~mask_male].median() )\n",
+    "# note the use of ~ to reverse the mask!"
    ]
   },
   {
@@ -1856,7 +1891,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# %load solutions/solution_describe.py"
+    "# %load solutions/solution_describe.py\n",
+    "df.Pclass = df.Pclass.astype('object')\n",
+    "df.Survived = df.Survived.astype('bool')\n",
+    "df.describe()"
    ]
   },
   {
@@ -2015,10 +2053,11 @@
     "axes[0].set_title(\"sine plot\")\n",
     "axes[0].set_xlabel(\"value\")\n",
     "axes[0].set_ylabel(\"sine/cosine value\")\n",
+    "axes[0].legend()\n",
     "\n",
     "## plotting on the second axe == right panel\n",
     "axes[1].plot(x, np.cos(x), label='cos')\n",
-    "axes[1].set_title(\"sine plot\")\n",
+    "axes[1].set_title(\"cosine plot\")\n",
     "axes[1].set_xlabel(\"value\")\n",
     "axes[1].set_ylabel(\"sine/cosine value\")\n",
     "\n",
@@ -2092,7 +2131,8 @@
     "# then you can specify where each plot goes on the figure with the ax argument of the ([0,0] is the top left corner)\n",
     "\n",
     "# Plot a simple histogram with binsize determined automatically\n",
-    "sns.histplot(dfFractions['0-14 y.o.'], kde=False, color=\"b\", ax=axes[0, 0]).set_title('automatic')\n",
+    "sns.histplot(dfFractions['0-14 y.o.'], kde=False, color=\"b\",\n",
+    "             ax=axes[0, 0]).set_title('automatic')\n",
     "\n",
     "# Plot a simple histogram with binsize 5, 10 , 1000\n",
     "sns.histplot(dfFractions['0-14 y.o.'], bins=5   , kde=False, color=\"b\", ax=axes[0, 1]).set_title('5 bins')\n",
@@ -2310,6 +2350,20 @@
     "* Compute survival rates by gender, age category and passenger class."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfc = df.copy()\n",
+    "def age_category(x):\n",
+    "    age_classes = {\"child\": 12, \"teenager\": 17, \"adult\": 64, \"senior\": 200}\n",
+    "    for label, threshold in age_classes.items():\n",
+    "        if x <= threshold:\n",
+    "            return label\n"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -2399,7 +2453,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sns.catplot( x='Fare' , y = 'Pclass' , orient='horizontal' ,  data=df , aspect = 2.0 , height = 7)"
+    "sns.catplot( x='Fare' , y = 'Pclass' ,\n",
+    "            orient='horizontal' , \n",
+    "            data=df , aspect = 2.0 , height = 7)"
    ]
   },
   {
@@ -2475,7 +2531,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sns.catplot( x='Fare' , y = 'Pclass' , hue='Sex' , orient='horizontal' , kind='box', data=df , aspect = 2.0 , height = 7)\n",
+    "sns.catplot( x='Fare' , \n",
+    "            y = 'Pclass' , \n",
+    "            hue='Sex' , \n",
+    "            orient='horizontal' , \n",
+    "            kind='box', data=df , aspect = 2.0 , height = 4)\n",
     "## it is also very nice with kind='point'"
    ]
   },
@@ -2500,13 +2560,22 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
     "df_census = pd.read_table(\"data/swiss_census_1880.csv\", sep=\",\")\n",
     "df_census.head()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -2541,7 +2610,7 @@
    "outputs": [],
    "source": [
     "# %load solutions/solution_01_03_fancy.py\n",
-    "# fancy solution inspired by  https://seaborn.pydata.org/examples/kde_ridgeplot.html\n"
+    "# fancy solution inspired by  https://seaborn.pydata.org/examples/kde_ridgeplot.html"
    ]
   },
   {
@@ -2621,7 +2690,9 @@
    "source": [
     "fig = plt.figure(figsize=(15,10))\n",
     "ax = sns.scatterplot( x = 'Fare' , y = 'Age' , \n",
-    "                     hue='Pclass' , palette=['xkcd:tomato','xkcd:teal','xkcd:mustard'],\n",
+    "                     hue='Pclass' , \n",
+    "                     palette=['xkcd:tomato',\n",
+    "                              'xkcd:teal','xkcd:mustard'],\n",
     "                     style='Sex', data=df , s=100 )\n",
     "ax.set(xscale=\"log\")      # setting axis to log scale.\n",
     "ax.set_xlim( (3,1000) )   # manually setting the limit of the x axis.\n"
@@ -2723,7 +2794,7 @@
     "             data=dfFractions , kind = k , orient='h',height=10, aspect=2 )\n",
     "\n",
     "# Save plot to disk, using the savefig() method:\n",
-    "my_plot.savefig(\"output.png\")"
+    "# my_plot.savefig(\"output.png\")"
    ]
   },
   {