{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 清洗和预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Explore\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country</th>\n",
       "      <th>beer_servings</th>\n",
       "      <th>spirit_servings</th>\n",
       "      <th>wine_servings</th>\n",
       "      <th>total_litres_of_pure_alcohol</th>\n",
       "      <th>continent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Asia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>89</td>\n",
       "      <td>132</td>\n",
       "      <td>54</td>\n",
       "      <td>4.9</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>25</td>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>0.7</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>245</td>\n",
       "      <td>138</td>\n",
       "      <td>312</td>\n",
       "      <td>12.4</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>217</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>5.9</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       country  beer_servings  spirit_servings  wine_servings  \\\n",
       "0  Afghanistan              0                0              0   \n",
       "1      Albania             89              132             54   \n",
       "2      Algeria             25                0             14   \n",
       "3      Andorra            245              138            312   \n",
       "4       Angola            217               57             45   \n",
       "\n",
       "   total_litres_of_pure_alcohol continent  \n",
       "0                           0.0      Asia  \n",
       "1                           4.9    Europe  \n",
       "2                           0.7    Africa  \n",
       "3                          12.4    Europe  \n",
       "4                           5.9    Africa  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drinks = pd.read_csv(\"data/drinks.csv\")\n",
    "drinks.head()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Max & Min\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "117    376\n",
       "45     361\n",
       "62     347\n",
       "65     346\n",
       "98     343\n",
       "135    343\n",
       "188    333\n",
       "81     313\n",
       "129    306\n",
       "140    297\n",
       "Name: beer_servings, dtype: int64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drinks[\"beer_servings\"].nlargest(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "117"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drinks[\"beer_servings\"].idxmax()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Count\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "continent\n",
       "Africa           53\n",
       "Europe           45\n",
       "Asia             44\n",
       "North America    23\n",
       "Oceania          16\n",
       "South America    12\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drinks[\"continent\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "continent\n",
       "Africa           0.274611\n",
       "Europe           0.233161\n",
       "Asia             0.227979\n",
       "North America    0.119171\n",
       "Oceania          0.082902\n",
       "South America    0.062176\n",
       "Name: proportion, dtype: float64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drinks[\"continent\"].value_counts(normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Africa', 'Europe', 'Asia', 'North America', 'Oceania',\n",
       "       'South America'],\n",
       "      dtype='object', name='continent')"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# access the Series index\n",
    "drinks[\"continent\"].value_counts().index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<NumpyExtensionArray>\n",
       "[np.int64(53), np.int64(45), np.int64(44), np.int64(23), np.int64(16),\n",
       " np.int64(12)]\n",
       "Length: 6, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# access the Series values\n",
    "drinks[\"continent\"].value_counts().array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "continent\n",
       "South America    12\n",
       "Oceania          16\n",
       "North America    23\n",
       "Asia             44\n",
       "Europe           45\n",
       "Africa           53\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# any Series can be sorted by its values\n",
    "drinks.continent.value_counts().sort_values()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "continent\n",
       "Africa           53\n",
       "Asia             44\n",
       "Europe           45\n",
       "North America    23\n",
       "Oceania          16\n",
       "South America    12\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# any Series can also be sorted by its index\n",
    "drinks[\"continent\"].value_counts().sort_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Asia', 'Europe', 'Africa', 'North America', 'South America',\n",
       "       'Oceania'], dtype=object)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drinks[\"continent\"].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drinks[\"continent\"].nunique()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sort\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>star_rating</th>\n",
       "      <th>title</th>\n",
       "      <th>content_rating</th>\n",
       "      <th>genre</th>\n",
       "      <th>duration</th>\n",
       "      <th>actors_list</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9.3</td>\n",
       "      <td>The Shawshank Redemption</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>142</td>\n",
       "      <td>[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>9.2</td>\n",
       "      <td>The Godfather</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>175</td>\n",
       "      <td>[u'Marlon Brando', u'Al Pacino', u'James Caan']</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9.1</td>\n",
       "      <td>The Godfather: Part II</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>200</td>\n",
       "      <td>[u'Al Pacino', u'Robert De Niro', u'Robert Duv...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>9.0</td>\n",
       "      <td>The Dark Knight</td>\n",
       "      <td>PG-13</td>\n",
       "      <td>Action</td>\n",
       "      <td>152</td>\n",
       "      <td>[u'Christian Bale', u'Heath Ledger', u'Aaron E...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8.9</td>\n",
       "      <td>Pulp Fiction</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>154</td>\n",
       "      <td>[u'John Travolta', u'Uma Thurman', u'Samuel L....</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   star_rating                     title content_rating   genre  duration  \\\n",
       "0          9.3  The Shawshank Redemption              R   Crime       142   \n",
       "1          9.2             The Godfather              R   Crime       175   \n",
       "2          9.1    The Godfather: Part II              R   Crime       200   \n",
       "3          9.0           The Dark Knight          PG-13  Action       152   \n",
       "4          8.9              Pulp Fiction              R   Crime       154   \n",
       "\n",
       "                                         actors_list  \n",
       "0  [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...  \n",
       "1    [u'Marlon Brando', u'Al Pacino', u'James Caan']  \n",
       "2  [u'Al Pacino', u'Robert De Niro', u'Robert Duv...  \n",
       "3  [u'Christian Bale', u'Heath Ledger', u'Aaron E...  \n",
       "4  [u'John Travolta', u'Uma Thurman', u'Samuel L....  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movies = pd.read_csv(\"data/imdb_1000.csv\")\n",
    "movies.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "542     (500) Days of Summer\n",
       "5               12 Angry Men\n",
       "201         12 Years a Slave\n",
       "698                127 Hours\n",
       "110    2001: A Space Odyssey\n",
       "Name: title, dtype: object"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sort the 'title' Series in ascending order (returns a Series)\n",
    "movies.title.sort_values().head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "864               [Rec]\n",
       "526                Zulu\n",
       "615          Zombieland\n",
       "677              Zodiac\n",
       "955    Zero Dark Thirty\n",
       "Name: title, dtype: object"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sort in descending order instead\n",
    "movies.title.sort_values(ascending=False).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>star_rating</th>\n",
       "      <th>title</th>\n",
       "      <th>content_rating</th>\n",
       "      <th>genre</th>\n",
       "      <th>duration</th>\n",
       "      <th>actors_list</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>542</th>\n",
       "      <td>7.8</td>\n",
       "      <td>(500) Days of Summer</td>\n",
       "      <td>PG-13</td>\n",
       "      <td>Comedy</td>\n",
       "      <td>95</td>\n",
       "      <td>[u'Zooey Deschanel', u'Joseph Gordon-Levitt', ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>8.9</td>\n",
       "      <td>12 Angry Men</td>\n",
       "      <td>NOT RATED</td>\n",
       "      <td>Drama</td>\n",
       "      <td>96</td>\n",
       "      <td>[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>201</th>\n",
       "      <td>8.1</td>\n",
       "      <td>12 Years a Slave</td>\n",
       "      <td>R</td>\n",
       "      <td>Biography</td>\n",
       "      <td>134</td>\n",
       "      <td>[u'Chiwetel Ejiofor', u'Michael Kenneth Willia...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>698</th>\n",
       "      <td>7.6</td>\n",
       "      <td>127 Hours</td>\n",
       "      <td>R</td>\n",
       "      <td>Adventure</td>\n",
       "      <td>94</td>\n",
       "      <td>[u'James Franco', u'Amber Tamblyn', u'Kate Mara']</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>8.3</td>\n",
       "      <td>2001: A Space Odyssey</td>\n",
       "      <td>G</td>\n",
       "      <td>Mystery</td>\n",
       "      <td>160</td>\n",
       "      <td>[u'Keir Dullea', u'Gary Lockwood', u'William S...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     star_rating                  title content_rating      genre  duration  \\\n",
       "542          7.8   (500) Days of Summer          PG-13     Comedy        95   \n",
       "5            8.9           12 Angry Men      NOT RATED      Drama        96   \n",
       "201          8.1       12 Years a Slave              R  Biography       134   \n",
       "698          7.6              127 Hours              R  Adventure        94   \n",
       "110          8.3  2001: A Space Odyssey              G    Mystery       160   \n",
       "\n",
       "                                           actors_list  \n",
       "542  [u'Zooey Deschanel', u'Joseph Gordon-Levitt', ...  \n",
       "5    [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals...  \n",
       "201  [u'Chiwetel Ejiofor', u'Michael Kenneth Willia...  \n",
       "698  [u'James Franco', u'Amber Tamblyn', u'Kate Mara']  \n",
       "110  [u'Keir Dullea', u'Gary Lockwood', u'William S...  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sort the entire DataFrame by the 'title' Series (returns a DataFrame)\n",
    "movies.sort_values(\"title\").head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>star_rating</th>\n",
       "      <th>title</th>\n",
       "      <th>content_rating</th>\n",
       "      <th>genre</th>\n",
       "      <th>duration</th>\n",
       "      <th>actors_list</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>864</th>\n",
       "      <td>7.5</td>\n",
       "      <td>[Rec]</td>\n",
       "      <td>R</td>\n",
       "      <td>Horror</td>\n",
       "      <td>78</td>\n",
       "      <td>[u'Manuela Velasco', u'Ferran Terraza', u'Jorg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>526</th>\n",
       "      <td>7.8</td>\n",
       "      <td>Zulu</td>\n",
       "      <td>UNRATED</td>\n",
       "      <td>Drama</td>\n",
       "      <td>138</td>\n",
       "      <td>[u'Stanley Baker', u'Jack Hawkins', u'Ulla Jac...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>615</th>\n",
       "      <td>7.7</td>\n",
       "      <td>Zombieland</td>\n",
       "      <td>R</td>\n",
       "      <td>Comedy</td>\n",
       "      <td>88</td>\n",
       "      <td>[u'Jesse Eisenberg', u'Emma Stone', u'Woody Ha...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>677</th>\n",
       "      <td>7.7</td>\n",
       "      <td>Zodiac</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>157</td>\n",
       "      <td>[u'Jake Gyllenhaal', u'Robert Downey Jr.', u'M...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>955</th>\n",
       "      <td>7.4</td>\n",
       "      <td>Zero Dark Thirty</td>\n",
       "      <td>R</td>\n",
       "      <td>Drama</td>\n",
       "      <td>157</td>\n",
       "      <td>[u'Jessica Chastain', u'Joel Edgerton', u'Chri...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     star_rating             title content_rating   genre  duration  \\\n",
       "864          7.5             [Rec]              R  Horror        78   \n",
       "526          7.8              Zulu        UNRATED   Drama       138   \n",
       "615          7.7        Zombieland              R  Comedy        88   \n",
       "677          7.7            Zodiac              R   Crime       157   \n",
       "955          7.4  Zero Dark Thirty              R   Drama       157   \n",
       "\n",
       "                                           actors_list  \n",
       "864  [u'Manuela Velasco', u'Ferran Terraza', u'Jorg...  \n",
       "526  [u'Stanley Baker', u'Jack Hawkins', u'Ulla Jac...  \n",
       "615  [u'Jesse Eisenberg', u'Emma Stone', u'Woody Ha...  \n",
       "677  [u'Jake Gyllenhaal', u'Robert Downey Jr.', u'M...  \n",
       "955  [u'Jessica Chastain', u'Joel Edgerton', u'Chri...  "
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sort in descending order instead\n",
    "movies.sort_values(\"title\", ascending=False).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>star_rating</th>\n",
       "      <th>title</th>\n",
       "      <th>content_rating</th>\n",
       "      <th>genre</th>\n",
       "      <th>duration</th>\n",
       "      <th>actors_list</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>713</th>\n",
       "      <td>7.6</td>\n",
       "      <td>The Jungle Book</td>\n",
       "      <td>APPROVED</td>\n",
       "      <td>Animation</td>\n",
       "      <td>78</td>\n",
       "      <td>[u'Phil Harris', u'Sebastian Cabot', u'Louis P...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>513</th>\n",
       "      <td>7.8</td>\n",
       "      <td>Invasion of the Body Snatchers</td>\n",
       "      <td>APPROVED</td>\n",
       "      <td>Horror</td>\n",
       "      <td>80</td>\n",
       "      <td>[u'Kevin McCarthy', u'Dana Wynter', u'Larry Ga...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>272</th>\n",
       "      <td>8.1</td>\n",
       "      <td>The Killing</td>\n",
       "      <td>APPROVED</td>\n",
       "      <td>Crime</td>\n",
       "      <td>85</td>\n",
       "      <td>[u'Sterling Hayden', u'Coleen Gray', u'Vince E...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>703</th>\n",
       "      <td>7.6</td>\n",
       "      <td>Dracula</td>\n",
       "      <td>APPROVED</td>\n",
       "      <td>Horror</td>\n",
       "      <td>85</td>\n",
       "      <td>[u'Bela Lugosi', u'Helen Chandler', u'David Ma...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>612</th>\n",
       "      <td>7.7</td>\n",
       "      <td>A Hard Day's Night</td>\n",
       "      <td>APPROVED</td>\n",
       "      <td>Comedy</td>\n",
       "      <td>87</td>\n",
       "      <td>[u'John Lennon', u'Paul McCartney', u'George H...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     star_rating                           title content_rating      genre  \\\n",
       "713          7.6                 The Jungle Book       APPROVED  Animation   \n",
       "513          7.8  Invasion of the Body Snatchers       APPROVED     Horror   \n",
       "272          8.1                     The Killing       APPROVED      Crime   \n",
       "703          7.6                         Dracula       APPROVED     Horror   \n",
       "612          7.7              A Hard Day's Night       APPROVED     Comedy   \n",
       "\n",
       "     duration                                        actors_list  \n",
       "713        78  [u'Phil Harris', u'Sebastian Cabot', u'Louis P...  \n",
       "513        80  [u'Kevin McCarthy', u'Dana Wynter', u'Larry Ga...  \n",
       "272        85  [u'Sterling Hayden', u'Coleen Gray', u'Vince E...  \n",
       "703        85  [u'Bela Lugosi', u'Helen Chandler', u'David Ma...  \n",
       "612        87  [u'John Lennon', u'Paul McCartney', u'George H...  "
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sort the DataFrame first by 'content_rating', then by 'duration'\n",
    "movies.sort_values([\"content_rating\", \"duration\"]).head()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Correlation\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>continent</th>\n",
       "      <th>Africa</th>\n",
       "      <th>Asia</th>\n",
       "      <th>Europe</th>\n",
       "      <th>North America</th>\n",
       "      <th>Oceania</th>\n",
       "      <th>South America</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Afghanistan</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Albania</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Algeria</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Andorra</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Angola</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Venezuela</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Vietnam</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Yemen</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Zambia</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Zimbabwe</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>193 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "continent    Africa  Asia  Europe  North America  Oceania  South America\n",
       "country                                                                 \n",
       "Afghanistan       0     1       0              0        0              0\n",
       "Albania           0     0       1              0        0              0\n",
       "Algeria           1     0       0              0        0              0\n",
       "Andorra           0     0       1              0        0              0\n",
       "Angola            1     0       0              0        0              0\n",
       "...             ...   ...     ...            ...      ...            ...\n",
       "Venezuela         0     0       0              0        0              1\n",
       "Vietnam           0     1       0              0        0              0\n",
       "Yemen             0     1       0              0        0              0\n",
       "Zambia            1     0       0              0        0              0\n",
       "Zimbabwe          1     0       0              0        0              0\n",
       "\n",
       "[193 rows x 6 columns]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.crosstab(drinks[\"country\"], drinks[\"continent\"])"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Transform\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Group\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country</th>\n",
       "      <th>beer_servings</th>\n",
       "      <th>spirit_servings</th>\n",
       "      <th>wine_servings</th>\n",
       "      <th>total_litres_of_pure_alcohol</th>\n",
       "      <th>continent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Asia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>89</td>\n",
       "      <td>132</td>\n",
       "      <td>54</td>\n",
       "      <td>4.9</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>25</td>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>0.7</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>245</td>\n",
       "      <td>138</td>\n",
       "      <td>312</td>\n",
       "      <td>12.4</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>217</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>5.9</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       country  beer_servings  spirit_servings  wine_servings  \\\n",
       "0  Afghanistan              0                0              0   \n",
       "1      Albania             89              132             54   \n",
       "2      Algeria             25                0             14   \n",
       "3      Andorra            245              138            312   \n",
       "4       Angola            217               57             45   \n",
       "\n",
       "   total_litres_of_pure_alcohol continent  \n",
       "0                           0.0      Asia  \n",
       "1                           4.9    Europe  \n",
       "2                           0.7    Africa  \n",
       "3                          12.4    Europe  \n",
       "4                           5.9    Africa  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drinks = pd.read_csv(\"data/drinks.csv\")\n",
    "drinks.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country</th>\n",
       "      <th>beer_servings</th>\n",
       "      <th>spirit_servings</th>\n",
       "      <th>wine_servings</th>\n",
       "      <th>total_litres_of_pure_alcohol</th>\n",
       "      <th>continent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>Belarus</td>\n",
       "      <td>142</td>\n",
       "      <td>373</td>\n",
       "      <td>42</td>\n",
       "      <td>14.4</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>Burundi</td>\n",
       "      <td>88</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>6.3</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>Cambodia</td>\n",
       "      <td>57</td>\n",
       "      <td>65</td>\n",
       "      <td>1</td>\n",
       "      <td>2.2</td>\n",
       "      <td>Asia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>Costa Rica</td>\n",
       "      <td>149</td>\n",
       "      <td>87</td>\n",
       "      <td>11</td>\n",
       "      <td>4.4</td>\n",
       "      <td>North America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52</th>\n",
       "      <td>Ecuador</td>\n",
       "      <td>162</td>\n",
       "      <td>74</td>\n",
       "      <td>3</td>\n",
       "      <td>4.2</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>Micronesia</td>\n",
       "      <td>62</td>\n",
       "      <td>50</td>\n",
       "      <td>18</td>\n",
       "      <td>2.3</td>\n",
       "      <td>Oceania</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        country  beer_servings  spirit_servings  wine_servings  \\\n",
       "15      Belarus            142              373             42   \n",
       "27      Burundi             88                0              0   \n",
       "30     Cambodia             57               65              1   \n",
       "41   Costa Rica            149               87             11   \n",
       "52      Ecuador            162               74              3   \n",
       "110  Micronesia             62               50             18   \n",
       "\n",
       "     total_litres_of_pure_alcohol      continent  \n",
       "15                           14.4         Europe  \n",
       "27                            6.3         Africa  \n",
       "30                            2.2           Asia  \n",
       "41                            4.4  North America  \n",
       "52                            4.2  South America  \n",
       "110                           2.3        Oceania  "
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drinks.groupby(\"continent\").nth(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "continent\n",
       "Africa            61.471698\n",
       "Asia              37.045455\n",
       "Europe           193.777778\n",
       "North America    145.434783\n",
       "Oceania           89.687500\n",
       "South America    175.083333\n",
       "Name: beer_servings, dtype: float64"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# calculate the mean beer servings for each continent\n",
    "drinks.groupby(\"continent\")[\"beer_servings\"].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "continent\n",
       "Africa           376\n",
       "Asia             247\n",
       "Europe           361\n",
       "North America    285\n",
       "Oceania          306\n",
       "South America    333\n",
       "Name: beer_servings, dtype: int64"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# other aggregation functions (such as 'max') can also be used with groupby\n",
    "drinks.groupby(\"continent\")[\"beer_servings\"].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "      <th>min</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>continent</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Africa</th>\n",
       "      <td>53</td>\n",
       "      <td>61.471698</td>\n",
       "      <td>0</td>\n",
       "      <td>376</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Asia</th>\n",
       "      <td>44</td>\n",
       "      <td>37.045455</td>\n",
       "      <td>0</td>\n",
       "      <td>247</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Europe</th>\n",
       "      <td>45</td>\n",
       "      <td>193.777778</td>\n",
       "      <td>0</td>\n",
       "      <td>361</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>North America</th>\n",
       "      <td>23</td>\n",
       "      <td>145.434783</td>\n",
       "      <td>1</td>\n",
       "      <td>285</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Oceania</th>\n",
       "      <td>16</td>\n",
       "      <td>89.687500</td>\n",
       "      <td>0</td>\n",
       "      <td>306</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>South America</th>\n",
       "      <td>12</td>\n",
       "      <td>175.083333</td>\n",
       "      <td>93</td>\n",
       "      <td>333</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               count        mean  min  max\n",
       "continent                                 \n",
       "Africa            53   61.471698    0  376\n",
       "Asia              44   37.045455    0  247\n",
       "Europe            45  193.777778    0  361\n",
       "North America     23  145.434783    1  285\n",
       "Oceania           16   89.687500    0  306\n",
       "South America     12  175.083333   93  333"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# multiple aggregation functions can be applied simultaneously\n",
    "drinks.groupby(\"continent\")[\"beer_servings\"].agg([\"count\", \"mean\", \"min\", \"max\"])"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Apply\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country</th>\n",
       "      <th>beer_servings</th>\n",
       "      <th>spirit_servings</th>\n",
       "      <th>wine_servings</th>\n",
       "      <th>total_litres_of_pure_alcohol</th>\n",
       "      <th>continent</th>\n",
       "      <th>average_wine_servings</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Asia</td>\n",
       "      <td>9.068182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>89</td>\n",
       "      <td>132</td>\n",
       "      <td>54</td>\n",
       "      <td>4.9</td>\n",
       "      <td>Europe</td>\n",
       "      <td>142.222222</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>25</td>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>0.7</td>\n",
       "      <td>Africa</td>\n",
       "      <td>16.264151</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>245</td>\n",
       "      <td>138</td>\n",
       "      <td>312</td>\n",
       "      <td>12.4</td>\n",
       "      <td>Europe</td>\n",
       "      <td>142.222222</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>217</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>5.9</td>\n",
       "      <td>Africa</td>\n",
       "      <td>16.264151</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       country  beer_servings  spirit_servings  wine_servings  \\\n",
       "0  Afghanistan              0                0              0   \n",
       "1      Albania             89              132             54   \n",
       "2      Algeria             25                0             14   \n",
       "3      Andorra            245              138            312   \n",
       "4       Angola            217               57             45   \n",
       "\n",
       "   total_litres_of_pure_alcohol continent  average_wine_servings  \n",
       "0                           0.0      Asia               9.068182  \n",
       "1                           4.9    Europe             142.222222  \n",
       "2                           0.7    Africa              16.264151  \n",
       "3                          12.4    Europe             142.222222  \n",
       "4                           5.9    Africa              16.264151  "
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drinks[\"average_wine_servings\"] = drinks.groupby(\"continent\")[\n",
    "    \"wine_servings\"\n",
    "].transform(\"mean\")\n",
    "drinks.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country</th>\n",
       "      <th>beer_servings</th>\n",
       "      <th>spirit_servings</th>\n",
       "      <th>wine_servings</th>\n",
       "      <th>total_litres_of_pure_alcohol</th>\n",
       "      <th>continent</th>\n",
       "      <th>average_wine_servings</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Asia</td>\n",
       "      <td>0.090682</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>89</td>\n",
       "      <td>132</td>\n",
       "      <td>54</td>\n",
       "      <td>4.9</td>\n",
       "      <td>Europe</td>\n",
       "      <td>1.422222</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>25</td>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>0.7</td>\n",
       "      <td>Africa</td>\n",
       "      <td>0.162642</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>245</td>\n",
       "      <td>138</td>\n",
       "      <td>312</td>\n",
       "      <td>12.4</td>\n",
       "      <td>Europe</td>\n",
       "      <td>1.422222</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>217</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>5.9</td>\n",
       "      <td>Africa</td>\n",
       "      <td>0.162642</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       country  beer_servings  spirit_servings  wine_servings  \\\n",
       "0  Afghanistan              0                0              0   \n",
       "1      Albania             89              132             54   \n",
       "2      Algeria             25                0             14   \n",
       "3      Andorra            245              138            312   \n",
       "4       Angola            217               57             45   \n",
       "\n",
       "   total_litres_of_pure_alcohol continent  average_wine_servings  \n",
       "0                           0.0      Asia               0.090682  \n",
       "1                           4.9    Europe               1.422222  \n",
       "2                           0.7    Africa               0.162642  \n",
       "3                          12.4    Europe               1.422222  \n",
       "4                           5.9    Africa               0.162642  "
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drinks[\"average_wine_servings\"] = drinks[\"average_wine_servings\"].apply(\n",
    "    lambda x: x / 100\n",
    ")\n",
    "drinks.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sepal_length</th>\n",
       "      <th>sepal_width</th>\n",
       "      <th>petal_length</th>\n",
       "      <th>petal_width</th>\n",
       "      <th>species</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5.1</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.9</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.7</td>\n",
       "      <td>3.2</td>\n",
       "      <td>1.3</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.6</td>\n",
       "      <td>3.1</td>\n",
       "      <td>1.5</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.6</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   sepal_length  sepal_width  petal_length  petal_width species\n",
       "0           5.1          3.5           1.4          0.2  setosa\n",
       "1           4.9          3.0           1.4          0.2  setosa\n",
       "2           4.7          3.2           1.3          0.2  setosa\n",
       "3           4.6          3.1           1.5          0.2  setosa\n",
       "4           5.0          3.6           1.4          0.2  setosa"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris = pd.read_csv(\"data/iris.csv\")\n",
    "iris.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "species\n",
       "setosa        50\n",
       "versicolor    50\n",
       "virginica     50\n",
       "dtype: int64"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris.groupby(\"species\").size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sepal_length</th>\n",
       "      <th>sepal_width</th>\n",
       "      <th>petal_length</th>\n",
       "      <th>petal_width</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>species</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>setosa</th>\n",
       "      <td>14</td>\n",
       "      <td>15</td>\n",
       "      <td>24</td>\n",
       "      <td>43</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>versicolor</th>\n",
       "      <td>50</td>\n",
       "      <td>85</td>\n",
       "      <td>83</td>\n",
       "      <td>70</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>virginica</th>\n",
       "      <td>131</td>\n",
       "      <td>117</td>\n",
       "      <td>118</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            sepal_length  sepal_width  petal_length  petal_width\n",
       "species                                                         \n",
       "setosa                14           15            24           43\n",
       "versicolor            50           85            83           70\n",
       "virginica            131          117           118          100"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris.groupby(\"species\").idxmax()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sepal_min</th>\n",
       "      <th>sepal_max</th>\n",
       "      <th>petal_mean</th>\n",
       "      <th>petal_std</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>species</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>setosa</th>\n",
       "      <td>4.3</td>\n",
       "      <td>5.8</td>\n",
       "      <td>1.462</td>\n",
       "      <td>0.173664</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>versicolor</th>\n",
       "      <td>4.9</td>\n",
       "      <td>7.0</td>\n",
       "      <td>4.260</td>\n",
       "      <td>0.469911</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>virginica</th>\n",
       "      <td>4.9</td>\n",
       "      <td>7.9</td>\n",
       "      <td>5.552</td>\n",
       "      <td>0.551895</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            sepal_min  sepal_max  petal_mean  petal_std\n",
       "species                                                \n",
       "setosa            4.3        5.8       1.462   0.173664\n",
       "versicolor        4.9        7.0       4.260   0.469911\n",
       "virginica         4.9        7.9       5.552   0.551895"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris.groupby(\"species\").agg(\n",
    "    sepal_min=(\"sepal_length\", \"min\"),\n",
    "    sepal_max=(\"sepal_length\", \"max\"),\n",
    "    petal_mean=(\"petal_length\", \"mean\"),\n",
    "    petal_std=(\"petal_length\", \"std\"),\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Apply a function\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked  \n",
       "0      0         A/5 21171   7.2500   NaN        S  \n",
       "1      0          PC 17599  71.2833   C85        C  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3      0            113803  53.1000  C123        S  \n",
       "4      0            373450   8.0500   NaN        S  "
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = pd.read_csv(\"data/titanic_train.csv\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sex</th>\n",
       "      <th>Sex_num</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>male</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>female</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>female</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>female</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>male</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Sex  Sex_num\n",
       "0    male        1\n",
       "1  female        0\n",
       "2  female        0\n",
       "3  female        0\n",
       "4    male        1"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# map 'female' to 0 and 'male' to 1\n",
    "train[\"Sex_num\"] = train.Sex.map({\"female\": 0, \"male\": 1})\n",
    "train.loc[0:4, [\"Sex\", \"Sex_num\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Name_length</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>51</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>44</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                Name  Name_length\n",
       "0                            Braund, Mr. Owen Harris           23\n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...           51\n",
       "2                             Heikkinen, Miss. Laina           22\n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)           44\n",
       "4                           Allen, Mr. William Henry           24"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# calculate the length of each string in the 'Name' Series\n",
    "train[\"Name_length\"] = train[\"Name\"].apply(len)\n",
    "train.loc[0:4, [\"Name\", \"Name_length\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0                           [Braund,  Mr. Owen Harris]\n",
       "1    [Cumings,  Mrs. John Bradley (Florence Briggs ...\n",
       "2                            [Heikkinen,  Miss. Laina]\n",
       "3      [Futrelle,  Mrs. Jacques Heath (Lily May Peel)]\n",
       "4                          [Allen,  Mr. William Henry]\n",
       "Name: Name, dtype: object"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use a string method to split the 'Name' Series at commas (returns a Series of lists)\n",
    "train[\"Name\"].str.split(\",\").head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       Braund\n",
       "1      Cumings\n",
       "2    Heikkinen\n",
       "3     Futrelle\n",
       "4        Allen\n",
       "Name: Name, dtype: object"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#  use a lambda function\n",
    "train[\"Name\"].str.split(\",\").apply(lambda x: x[0]).head()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### MultiIndex\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Date</th>\n",
       "      <th>Close</th>\n",
       "      <th>Volume</th>\n",
       "      <th>Symbol</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2016-10-03</td>\n",
       "      <td>31.50</td>\n",
       "      <td>14070500</td>\n",
       "      <td>CSCO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2016-10-03</td>\n",
       "      <td>112.52</td>\n",
       "      <td>21701800</td>\n",
       "      <td>AAPL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2016-10-03</td>\n",
       "      <td>57.42</td>\n",
       "      <td>19189500</td>\n",
       "      <td>MSFT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2016-10-04</td>\n",
       "      <td>113.00</td>\n",
       "      <td>29736800</td>\n",
       "      <td>AAPL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2016-10-04</td>\n",
       "      <td>57.24</td>\n",
       "      <td>20085900</td>\n",
       "      <td>MSFT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2016-10-04</td>\n",
       "      <td>31.35</td>\n",
       "      <td>18460400</td>\n",
       "      <td>CSCO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2016-10-05</td>\n",
       "      <td>57.64</td>\n",
       "      <td>16726400</td>\n",
       "      <td>MSFT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2016-10-05</td>\n",
       "      <td>31.59</td>\n",
       "      <td>11808600</td>\n",
       "      <td>CSCO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>2016-10-05</td>\n",
       "      <td>113.05</td>\n",
       "      <td>21453100</td>\n",
       "      <td>AAPL</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         Date   Close    Volume Symbol\n",
       "0  2016-10-03   31.50  14070500   CSCO\n",
       "1  2016-10-03  112.52  21701800   AAPL\n",
       "2  2016-10-03   57.42  19189500   MSFT\n",
       "3  2016-10-04  113.00  29736800   AAPL\n",
       "4  2016-10-04   57.24  20085900   MSFT\n",
       "5  2016-10-04   31.35  18460400   CSCO\n",
       "6  2016-10-05   57.64  16726400   MSFT\n",
       "7  2016-10-05   31.59  11808600   CSCO\n",
       "8  2016-10-05  113.05  21453100   AAPL"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks = pd.read_csv(\"data/stocks.csv\")\n",
    "stocks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RangeIndex(start=0, stop=9, step=1)"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Symbol  Date      \n",
       "AAPL    2016-10-03    112.52\n",
       "        2016-10-04    113.00\n",
       "        2016-10-05    113.05\n",
       "CSCO    2016-10-03     31.50\n",
       "        2016-10-04     31.35\n",
       "        2016-10-05     31.59\n",
       "MSFT    2016-10-03     57.42\n",
       "        2016-10-04     57.24\n",
       "        2016-10-05     57.64\n",
       "Name: Close, dtype: float64"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ser = stocks.groupby([\"Symbol\", \"Date\"])[\"Close\"].mean()\n",
    "ser"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MultiIndex([('AAPL', '2016-10-03'),\n",
       "            ('AAPL', '2016-10-04'),\n",
       "            ('AAPL', '2016-10-05'),\n",
       "            ('CSCO', '2016-10-03'),\n",
       "            ('CSCO', '2016-10-04'),\n",
       "            ('CSCO', '2016-10-05'),\n",
       "            ('MSFT', '2016-10-03'),\n",
       "            ('MSFT', '2016-10-04'),\n",
       "            ('MSFT', '2016-10-05')],\n",
       "           names=['Symbol', 'Date'])"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ser.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Symbol  Date      \n",
       "AAPL    2016-10-03    112.52\n",
       "        2016-10-04    113.00\n",
       "        2016-10-05    113.05\n",
       "CSCO    2016-10-03     31.50\n",
       "        2016-10-04     31.35\n",
       "        2016-10-05     31.59\n",
       "MSFT    2016-10-03     57.42\n",
       "        2016-10-04     57.24\n",
       "        2016-10-05     57.64\n",
       "Name: Close, dtype: float64"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ser"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"3\" halign=\"left\">Close</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Date</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <th>2016-10-04</th>\n",
       "      <th>2016-10-05</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Symbol</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AAPL</th>\n",
       "      <td>112.52</td>\n",
       "      <td>113.00</td>\n",
       "      <td>113.05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CSCO</th>\n",
       "      <td>31.50</td>\n",
       "      <td>31.35</td>\n",
       "      <td>31.59</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MSFT</th>\n",
       "      <td>57.42</td>\n",
       "      <td>57.24</td>\n",
       "      <td>57.64</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            Close                      \n",
       "Date   2016-10-03 2016-10-04 2016-10-05\n",
       "Symbol                                 \n",
       "AAPL       112.52     113.00     113.05\n",
       "CSCO        31.50      31.35      31.59\n",
       "MSFT        57.42      57.24      57.64"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ser.to_frame().pivot_table(index=\"Symbol\", columns=\"Date\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>Close</th>\n",
       "      <th>Volume</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Symbol</th>\n",
       "      <th>Date</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>CSCO</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>31.50</td>\n",
       "      <td>14070500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAPL</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>112.52</td>\n",
       "      <td>21701800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MSFT</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>57.42</td>\n",
       "      <td>19189500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAPL</th>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>113.00</td>\n",
       "      <td>29736800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MSFT</th>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>57.24</td>\n",
       "      <td>20085900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CSCO</th>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>31.35</td>\n",
       "      <td>18460400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MSFT</th>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>57.64</td>\n",
       "      <td>16726400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CSCO</th>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>31.59</td>\n",
       "      <td>11808600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAPL</th>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>113.05</td>\n",
       "      <td>21453100</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Close    Volume\n",
       "Symbol Date                        \n",
       "CSCO   2016-10-03   31.50  14070500\n",
       "AAPL   2016-10-03  112.52  21701800\n",
       "MSFT   2016-10-03   57.42  19189500\n",
       "AAPL   2016-10-04  113.00  29736800\n",
       "MSFT   2016-10-04   57.24  20085900\n",
       "CSCO   2016-10-04   31.35  18460400\n",
       "MSFT   2016-10-05   57.64  16726400\n",
       "CSCO   2016-10-05   31.59  11808600\n",
       "AAPL   2016-10-05  113.05  21453100"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks = stocks.set_index([\"Symbol\", \"Date\"])\n",
    "stocks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MultiIndex([('CSCO', '2016-10-03'),\n",
       "            ('AAPL', '2016-10-03'),\n",
       "            ('MSFT', '2016-10-03'),\n",
       "            ('AAPL', '2016-10-04'),\n",
       "            ('MSFT', '2016-10-04'),\n",
       "            ('CSCO', '2016-10-04'),\n",
       "            ('MSFT', '2016-10-05'),\n",
       "            ('CSCO', '2016-10-05'),\n",
       "            ('AAPL', '2016-10-05')],\n",
       "           names=['Symbol', 'Date'])"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>Close</th>\n",
       "      <th>Volume</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Symbol</th>\n",
       "      <th>Date</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">AAPL</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>112.52</td>\n",
       "      <td>21701800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>113.00</td>\n",
       "      <td>29736800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>113.05</td>\n",
       "      <td>21453100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">CSCO</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>31.50</td>\n",
       "      <td>14070500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>31.35</td>\n",
       "      <td>18460400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>31.59</td>\n",
       "      <td>11808600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">MSFT</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>57.42</td>\n",
       "      <td>19189500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>57.24</td>\n",
       "      <td>20085900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>57.64</td>\n",
       "      <td>16726400</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Close    Volume\n",
       "Symbol Date                        \n",
       "AAPL   2016-10-03  112.52  21701800\n",
       "       2016-10-04  113.00  29736800\n",
       "       2016-10-05  113.05  21453100\n",
       "CSCO   2016-10-03   31.50  14070500\n",
       "       2016-10-04   31.35  18460400\n",
       "       2016-10-05   31.59  11808600\n",
       "MSFT   2016-10-03   57.42  19189500\n",
       "       2016-10-04   57.24  20085900\n",
       "       2016-10-05   57.64  16726400"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks = stocks.sort_index()\n",
    "stocks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Close          112.52\n",
       "Volume    21701800.00\n",
       "Name: (AAPL, 2016-10-03), dtype: float64"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks.loc[(\"AAPL\", \"2016-10-03\"), :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "np.float64(112.52)"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks.loc[(\"AAPL\", \"2016-10-03\"), \"Close\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>Close</th>\n",
       "      <th>Volume</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Symbol</th>\n",
       "      <th>Date</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">AAPL</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>112.52</td>\n",
       "      <td>21701800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>113.00</td>\n",
       "      <td>29736800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>113.05</td>\n",
       "      <td>21453100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">MSFT</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>57.42</td>\n",
       "      <td>19189500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>57.24</td>\n",
       "      <td>20085900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>57.64</td>\n",
       "      <td>16726400</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Close    Volume\n",
       "Symbol Date                        \n",
       "AAPL   2016-10-03  112.52  21701800\n",
       "       2016-10-04  113.00  29736800\n",
       "       2016-10-05  113.05  21453100\n",
       "MSFT   2016-10-03   57.42  19189500\n",
       "       2016-10-04   57.24  20085900\n",
       "       2016-10-05   57.64  16726400"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks.loc[[\"AAPL\", \"MSFT\"], :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>Close</th>\n",
       "      <th>Volume</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Symbol</th>\n",
       "      <th>Date</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AAPL</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>112.52</td>\n",
       "      <td>21701800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MSFT</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>57.42</td>\n",
       "      <td>19189500</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Close    Volume\n",
       "Symbol Date                        \n",
       "AAPL   2016-10-03  112.52  21701800\n",
       "MSFT   2016-10-03   57.42  19189500"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks.loc[([\"AAPL\", \"MSFT\"], \"2016-10-03\"), :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Symbol  Date      \n",
       "AAPL    2016-10-03    112.52\n",
       "MSFT    2016-10-03     57.42\n",
       "Name: Close, dtype: float64"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks.loc[([\"AAPL\", \"MSFT\"], \"2016-10-03\"), \"Close\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>Close</th>\n",
       "      <th>Volume</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Symbol</th>\n",
       "      <th>Date</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">AAPL</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>112.52</td>\n",
       "      <td>21701800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>113.00</td>\n",
       "      <td>29736800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">CSCO</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>31.50</td>\n",
       "      <td>14070500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>31.35</td>\n",
       "      <td>18460400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">MSFT</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>57.42</td>\n",
       "      <td>19189500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>57.24</td>\n",
       "      <td>20085900</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Close    Volume\n",
       "Symbol Date                        \n",
       "AAPL   2016-10-03  112.52  21701800\n",
       "       2016-10-04  113.00  29736800\n",
       "CSCO   2016-10-03   31.50  14070500\n",
       "       2016-10-04   31.35  18460400\n",
       "MSFT   2016-10-03   57.42  19189500\n",
       "       2016-10-04   57.24  20085900"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks.loc[(slice(None), [\"2016-10-03\", \"2016-10-04\"]), :]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pivot\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Date</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <th>2016-10-04</th>\n",
       "      <th>2016-10-05</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Symbol</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AAPL</th>\n",
       "      <td>112.52</td>\n",
       "      <td>113.00</td>\n",
       "      <td>113.05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CSCO</th>\n",
       "      <td>31.50</td>\n",
       "      <td>31.35</td>\n",
       "      <td>31.59</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MSFT</th>\n",
       "      <td>57.42</td>\n",
       "      <td>57.24</td>\n",
       "      <td>57.64</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Date    2016-10-03  2016-10-04  2016-10-05\n",
       "Symbol                                    \n",
       "AAPL        112.52      113.00      113.05\n",
       "CSCO         31.50       31.35       31.59\n",
       "MSFT         57.42       57.24       57.64"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks = stocks.pivot_table(values=\"Close\", index=\"Symbol\", columns=\"Date\")\n",
    "stocks"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clean\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Remove duplicate rows\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>gender</th>\n",
       "      <th>occupation</th>\n",
       "      <th>zip_code</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>24</td>\n",
       "      <td>M</td>\n",
       "      <td>technician</td>\n",
       "      <td>85711</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>53</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>94043</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>23</td>\n",
       "      <td>M</td>\n",
       "      <td>writer</td>\n",
       "      <td>32067</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>24</td>\n",
       "      <td>M</td>\n",
       "      <td>technician</td>\n",
       "      <td>43537</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>33</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>15213</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         age gender  occupation zip_code\n",
       "user_id                                 \n",
       "1         24      M  technician    85711\n",
       "2         53      F       other    94043\n",
       "3         23      M      writer    32067\n",
       "4         24      M  technician    43537\n",
       "5         33      F       other    15213"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "user_cols = [\"user_id\", \"age\", \"gender\", \"occupation\", \"zip_code\"]\n",
    "users = pd.read_csv(\n",
    "    \"data/movie.user\", sep=\"|\", header=None, names=user_cols, index_col=\"user_id\"\n",
    ")\n",
    "users.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(943, 4)"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "users.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "user_id\n",
       "939    False\n",
       "940     True\n",
       "941    False\n",
       "942    False\n",
       "943    False\n",
       "Name: zip_code, dtype: bool"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# detect duplicate zip codes: True if an item is identical to a previous item\n",
    "users[\"zip_code\"].duplicated().tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "np.int64(148)"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# count the duplicate items (True becomes 1, False becomes 0)\n",
    "users[\"zip_code\"].duplicated().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>gender</th>\n",
       "      <th>occupation</th>\n",
       "      <th>zip_code</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>496</th>\n",
       "      <td>21</td>\n",
       "      <td>F</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>572</th>\n",
       "      <td>51</td>\n",
       "      <td>M</td>\n",
       "      <td>educator</td>\n",
       "      <td>20003</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>621</th>\n",
       "      <td>17</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>60402</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>684</th>\n",
       "      <td>28</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>733</th>\n",
       "      <td>44</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>60630</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>805</th>\n",
       "      <td>27</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>20009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>32</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>97301</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         age gender occupation zip_code\n",
       "user_id                                \n",
       "496       21      F    student    55414\n",
       "572       51      M   educator    20003\n",
       "621       17      M    student    60402\n",
       "684       28      M    student    55414\n",
       "733       44      F      other    60630\n",
       "805       27      F      other    20009\n",
       "890       32      M    student    97301"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examine the duplicate rows (ignoring the first occurrence)\n",
    "users.loc[users.duplicated(keep=\"first\"), :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>gender</th>\n",
       "      <th>occupation</th>\n",
       "      <th>zip_code</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>67</th>\n",
       "      <td>17</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>60402</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>85</th>\n",
       "      <td>51</td>\n",
       "      <td>M</td>\n",
       "      <td>educator</td>\n",
       "      <td>20003</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>198</th>\n",
       "      <td>21</td>\n",
       "      <td>F</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>350</th>\n",
       "      <td>32</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>97301</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>428</th>\n",
       "      <td>28</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>437</th>\n",
       "      <td>27</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>20009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>460</th>\n",
       "      <td>44</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>60630</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         age gender occupation zip_code\n",
       "user_id                                \n",
       "67        17      M    student    60402\n",
       "85        51      M   educator    20003\n",
       "198       21      F    student    55414\n",
       "350       32      M    student    97301\n",
       "428       28      M    student    55414\n",
       "437       27      F      other    20009\n",
       "460       44      F      other    60630"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examine the duplicate rows (ignoring the last occurrence)\n",
    "users.loc[users.duplicated(keep=\"last\"), :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>gender</th>\n",
       "      <th>occupation</th>\n",
       "      <th>zip_code</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>67</th>\n",
       "      <td>17</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>60402</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>85</th>\n",
       "      <td>51</td>\n",
       "      <td>M</td>\n",
       "      <td>educator</td>\n",
       "      <td>20003</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>198</th>\n",
       "      <td>21</td>\n",
       "      <td>F</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>350</th>\n",
       "      <td>32</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>97301</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>428</th>\n",
       "      <td>28</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>437</th>\n",
       "      <td>27</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>20009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>460</th>\n",
       "      <td>44</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>60630</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496</th>\n",
       "      <td>21</td>\n",
       "      <td>F</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>572</th>\n",
       "      <td>51</td>\n",
       "      <td>M</td>\n",
       "      <td>educator</td>\n",
       "      <td>20003</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>621</th>\n",
       "      <td>17</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>60402</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>684</th>\n",
       "      <td>28</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>733</th>\n",
       "      <td>44</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>60630</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>805</th>\n",
       "      <td>27</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>20009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>32</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>97301</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         age gender occupation zip_code\n",
       "user_id                                \n",
       "67        17      M    student    60402\n",
       "85        51      M   educator    20003\n",
       "198       21      F    student    55414\n",
       "350       32      M    student    97301\n",
       "428       28      M    student    55414\n",
       "437       27      F      other    20009\n",
       "460       44      F      other    60630\n",
       "496       21      F    student    55414\n",
       "572       51      M   educator    20003\n",
       "621       17      M    student    60402\n",
       "684       28      M    student    55414\n",
       "733       44      F      other    60630\n",
       "805       27      F      other    20009\n",
       "890       32      M    student    97301"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examine the duplicate rows (including all duplicates)\n",
    "users.loc[users.duplicated(keep=False), :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(936, 4)"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# drop the duplicate rows (inplace=False by default)\n",
    "users.drop_duplicates(keep=\"first\").shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(936, 4)"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "users.drop_duplicates(keep=\"last\").shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(929, 4)"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "users.drop_duplicates(keep=False).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "np.int64(16)"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# only consider a subset of columns when identifying duplicates\n",
    "users.duplicated(subset=[\"age\", \"zip_code\"]).sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(927, 4)"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "users.drop_duplicates(subset=[\"age\", \"zip_code\"]).shape"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Detect missing values\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>18236</th>\n",
       "      <td>Grant Park</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>IL</td>\n",
       "      <td>12/31/2000 23:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18237</th>\n",
       "      <td>Spirit Lake</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>IA</td>\n",
       "      <td>12/31/2000 23:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18238</th>\n",
       "      <td>Eagle River</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>WI</td>\n",
       "      <td>12/31/2000 23:45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18239</th>\n",
       "      <td>Eagle River</td>\n",
       "      <td>RED</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>WI</td>\n",
       "      <td>12/31/2000 23:45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18240</th>\n",
       "      <td>Ybor</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>FL</td>\n",
       "      <td>12/31/2000 23:59</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              City Colors Reported Shape Reported State              Time\n",
       "18236   Grant Park             NaN       TRIANGLE    IL  12/31/2000 23:00\n",
       "18237  Spirit Lake             NaN           DISK    IA  12/31/2000 23:00\n",
       "18238  Eagle River             NaN            NaN    WI  12/31/2000 23:45\n",
       "18239  Eagle River             RED          LIGHT    WI  12/31/2000 23:45\n",
       "18240         Ybor             NaN           OVAL    FL  12/31/2000 23:59"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ufo = pd.read_csv(\"data/ufo.csv\")\n",
    "ufo.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>18236</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18237</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18238</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18239</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18240</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        City  Colors Reported  Shape Reported  State   Time\n",
       "18236  False             True           False  False  False\n",
       "18237  False             True           False  False  False\n",
       "18238  False             True            True  False  False\n",
       "18239  False            False           False  False  False\n",
       "18240  False             True           False  False  False"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ufo.isna().tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>18236</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18237</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18238</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18239</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18240</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       City  Colors Reported  Shape Reported  State  Time\n",
       "18236  True            False            True   True  True\n",
       "18237  True            False            True   True  True\n",
       "18238  True            False           False   True  True\n",
       "18239  True             True            True   True  True\n",
       "18240  True            False            True   True  True"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ufo.notna().tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "City                  26\n",
       "Colors Reported    15359\n",
       "Shape Reported      2644\n",
       "State                  0\n",
       "Time                   0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# count the number of missing values in each Series\n",
    "ufo.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LA</td>\n",
       "      <td>8/15/1943 0:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>LA</td>\n",
       "      <td>8/15/1943 0:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>204</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>CA</td>\n",
       "      <td>7/15/1952 12:30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>241</th>\n",
       "      <td>NaN</td>\n",
       "      <td>BLUE</td>\n",
       "      <td>DISK</td>\n",
       "      <td>MT</td>\n",
       "      <td>7/4/1953 14:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>613</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>NV</td>\n",
       "      <td>7/1/1960 12:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    City Colors Reported Shape Reported State             Time\n",
       "21   NaN             NaN            NaN    LA   8/15/1943 0:00\n",
       "22   NaN             NaN          LIGHT    LA   8/15/1943 0:00\n",
       "204  NaN             NaN           DISK    CA  7/15/1952 12:30\n",
       "241  NaN            BLUE           DISK    MT   7/4/1953 14:00\n",
       "613  NaN             NaN           DISK    NV   7/1/1960 12:00"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use the 'isna' Series method to filter the DataFrame rows\n",
    "ufo.loc[ufo.City.isna()].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(18241, 5)"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examine the number of rows and columns\n",
    "ufo.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2486, 5)"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# if 'any' values are missing in a row, then drop that row\n",
    "ufo.dropna(how=\"any\").shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(18241, 5)"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 'inplace' parameter for 'dropna' is False by default, thus rows were only dropped temporarily\n",
    "ufo.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(18241, 5)"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# if 'all' values are missing in a row, then drop that row (none are dropped in this case)\n",
    "ufo.dropna(how=\"all\").shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(15575, 5)"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# if 'any' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row\n",
    "ufo.dropna(subset=[\"City\", \"Shape Reported\"], how=\"any\").shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Shape Reported\n",
       "LIGHT       2803\n",
       "DISK        2122\n",
       "TRIANGLE    1889\n",
       "OTHER       1402\n",
       "CIRCLE      1365\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 'value_counts' does not include missing values by default\n",
    "ufo[\"Shape Reported\"].value_counts().head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Shape Reported\n",
       "LIGHT       2803\n",
       "NaN         2644\n",
       "DISK        2122\n",
       "TRIANGLE    1889\n",
       "OTHER       1402\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# explicitly include missing values\n",
    "ufo[\"Shape Reported\"].value_counts(dropna=False).head()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Fill missing values\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "# fill in missing values with a specified value\n",
    "ufo[\"Shape Reported\"] = ufo[\"Shape Reported\"].fillna(value=\"VARIOUS\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Shape Reported\n",
       "VARIOUS     2977\n",
       "LIGHT       2803\n",
       "DISK        2122\n",
       "TRIANGLE    1889\n",
       "OTHER       1402\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# confirm that the missing values were filled in\n",
    "ufo[\"Shape Reported\"].value_counts().head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>UNKNOWN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "      <td>6/1/1930 22:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>UNKNOWN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "      <td>6/30/1930 20:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>UNKNOWN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CO</td>\n",
       "      <td>2/15/1931 14:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Abilene</td>\n",
       "      <td>UNKNOWN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>KS</td>\n",
       "      <td>6/1/1931 13:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>New York Worlds Fair</td>\n",
       "      <td>UNKNOWN</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>NY</td>\n",
       "      <td>4/18/1933 19:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   City Colors Reported Shape Reported State             Time\n",
       "0                Ithaca         UNKNOWN       TRIANGLE    NY   6/1/1930 22:00\n",
       "1           Willingboro         UNKNOWN          OTHER    NJ  6/30/1930 20:00\n",
       "2               Holyoke         UNKNOWN           OVAL    CO  2/15/1931 14:00\n",
       "3               Abilene         UNKNOWN           DISK    KS   6/1/1931 13:00\n",
       "4  New York Worlds Fair         UNKNOWN          LIGHT    NY  4/18/1933 19:00"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# fill in missing values\n",
    "ufo.fillna(value=\"UNKNOWN\").head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/py/n14256yd5r5ddms88x9bvsv40000gn/T/ipykernel_27146/278037071.py:2: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.\n",
      "  ufo.fillna(method=\"bfill\").tail()\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>18236</th>\n",
       "      <td>Grant Park</td>\n",
       "      <td>RED</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>IL</td>\n",
       "      <td>12/31/2000 23:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18237</th>\n",
       "      <td>Spirit Lake</td>\n",
       "      <td>RED</td>\n",
       "      <td>DISK</td>\n",
       "      <td>IA</td>\n",
       "      <td>12/31/2000 23:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18238</th>\n",
       "      <td>Eagle River</td>\n",
       "      <td>RED</td>\n",
       "      <td>VARIOUS</td>\n",
       "      <td>WI</td>\n",
       "      <td>12/31/2000 23:45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18239</th>\n",
       "      <td>Eagle River</td>\n",
       "      <td>RED</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>WI</td>\n",
       "      <td>12/31/2000 23:45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18240</th>\n",
       "      <td>Ybor</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>FL</td>\n",
       "      <td>12/31/2000 23:59</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              City Colors Reported Shape Reported State              Time\n",
       "18236   Grant Park             RED       TRIANGLE    IL  12/31/2000 23:00\n",
       "18237  Spirit Lake             RED           DISK    IA  12/31/2000 23:00\n",
       "18238  Eagle River             RED        VARIOUS    WI  12/31/2000 23:45\n",
       "18239  Eagle River             RED          LIGHT    WI  12/31/2000 23:45\n",
       "18240         Ybor             NaN           OVAL    FL  12/31/2000 23:59"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# fill missing values using \"backward fill\" strategy (doesn't affect the DataFrame since inplace=False)\n",
    "ufo.fillna(method=\"bfill\").tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/py/n14256yd5r5ddms88x9bvsv40000gn/T/ipykernel_27146/2160560122.py:2: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.\n",
      "  ufo.fillna(method=\"ffill\").tail()\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>18236</th>\n",
       "      <td>Grant Park</td>\n",
       "      <td>RED</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>IL</td>\n",
       "      <td>12/31/2000 23:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18237</th>\n",
       "      <td>Spirit Lake</td>\n",
       "      <td>RED</td>\n",
       "      <td>DISK</td>\n",
       "      <td>IA</td>\n",
       "      <td>12/31/2000 23:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18238</th>\n",
       "      <td>Eagle River</td>\n",
       "      <td>RED</td>\n",
       "      <td>VARIOUS</td>\n",
       "      <td>WI</td>\n",
       "      <td>12/31/2000 23:45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18239</th>\n",
       "      <td>Eagle River</td>\n",
       "      <td>RED</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>WI</td>\n",
       "      <td>12/31/2000 23:45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18240</th>\n",
       "      <td>Ybor</td>\n",
       "      <td>RED</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>FL</td>\n",
       "      <td>12/31/2000 23:59</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              City Colors Reported Shape Reported State              Time\n",
       "18236   Grant Park             RED       TRIANGLE    IL  12/31/2000 23:00\n",
       "18237  Spirit Lake             RED           DISK    IA  12/31/2000 23:00\n",
       "18238  Eagle River             RED        VARIOUS    WI  12/31/2000 23:45\n",
       "18239  Eagle River             RED          LIGHT    WI  12/31/2000 23:45\n",
       "18240         Ybor             RED           OVAL    FL  12/31/2000 23:59"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# compare with \"forward fill\" strategy (doesn't affect the DataFrame since inplace=False)\n",
    "ufo.fillna(method=\"ffill\").tail()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Where & Mask\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked  \n",
       "0      0         A/5 21171   7.2500   NaN        S  \n",
       "1      0          PC 17599  71.2833   C85        C  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3      0            113803  53.1000  C123        S  \n",
       "4      0            373450   8.0500   NaN        S  "
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = pd.read_csv(\"data/titanic_train.csv\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>Female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>Female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>Female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  Female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  Female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  Female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked  \n",
       "0      0         A/5 21171   7.2500   NaN        S  \n",
       "1      0          PC 17599  71.2833   C85        C  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3      0            113803  53.1000  C123        S  \n",
       "4      0            373450   8.0500   NaN        S  "
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cond = train[\"Sex\"] == \"male\"\n",
    "# replace the values unsatisfied the condition, default is NaN\n",
    "train[\"Sex\"] = train[\"Sex\"].where(cond, other=\"Female\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>quality</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>Female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>Female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>Female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  Female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  Female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  Female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked quality  \n",
       "0      0         A/5 21171   7.2500   NaN        S     low  \n",
       "1      0          PC 17599  71.2833   C85        C     low  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S     low  \n",
       "3      0            113803  53.1000  C123        S     low  \n",
       "4      0            373450   8.0500   NaN        S          "
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train[\"quality\"] = \"\"\n",
    "cond1 = train[\"Sex\"] == \"male\"\n",
    "cond2 = train[\"Age\"] > 25\n",
    "# display the values unsatisfied the condition\n",
    "train[\"quality\"] = train[\"quality\"].where(cond1 & cond2, other=\"low\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>quality</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>Female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>Female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>Female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  Female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  Female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  Female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked quality  \n",
       "0      0         A/5 21171   7.2500   NaN        S          \n",
       "1      0          PC 17599  71.2833   C85        C          \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S          \n",
       "3      0            113803  53.1000  C123        S          \n",
       "4      0            373450   8.0500   NaN        S     low  "
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train[\"quality\"] = \"\"\n",
    "cond1 = train[\"Sex\"] == \"male\"\n",
    "cond2 = train[\"Age\"] > 25\n",
    "# mask the values unsatisfied the condition\n",
    "train[\"quality\"] = train[\"quality\"].mask(cond1 & cond2, other=\"low\")\n",
    "train.head()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Outliers (clip)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       10\n",
       "1       54\n",
       "2       14\n",
       "3      300\n",
       "4       45\n",
       "      ... \n",
       "188     10\n",
       "189     10\n",
       "190     10\n",
       "191     10\n",
       "192     10\n",
       "Name: wine_servings, Length: 193, dtype: int64"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drinks[\"wine_servings\"].clip(10, 300)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## String & Time\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Change the data type\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country</th>\n",
       "      <th>beer_servings</th>\n",
       "      <th>spirit_servings</th>\n",
       "      <th>wine_servings</th>\n",
       "      <th>total_litres_of_pure_alcohol</th>\n",
       "      <th>continent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Asia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>89</td>\n",
       "      <td>132</td>\n",
       "      <td>54</td>\n",
       "      <td>4.9</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>25</td>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>0.7</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>245</td>\n",
       "      <td>138</td>\n",
       "      <td>312</td>\n",
       "      <td>12.4</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>217</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>5.9</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       country  beer_servings  spirit_servings  wine_servings  \\\n",
       "0  Afghanistan              0                0              0   \n",
       "1      Albania             89              132             54   \n",
       "2      Algeria             25                0             14   \n",
       "3      Andorra            245              138            312   \n",
       "4       Angola            217               57             45   \n",
       "\n",
       "   total_litres_of_pure_alcohol continent  \n",
       "0                           0.0      Asia  \n",
       "1                           4.9    Europe  \n",
       "2                           0.7    Africa  \n",
       "3                          12.4    Europe  \n",
       "4                           5.9    Africa  "
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drinks = pd.read_csv(\"data/drinks.csv\")\n",
    "drinks.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "country                          object\n",
       "beer_servings                     int64\n",
       "spirit_servings                   int64\n",
       "wine_servings                     int64\n",
       "total_litres_of_pure_alcohol    float64\n",
       "continent                        object\n",
       "dtype: object"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examine the data type of each Series\n",
    "drinks.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "country                         string[python]\n",
       "beer_servings                            Int64\n",
       "spirit_servings                          Int64\n",
       "wine_servings                            Int64\n",
       "total_litres_of_pure_alcohol           Float64\n",
       "continent                       string[python]\n",
       "dtype: object"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drinks.convert_dtypes().dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "country                          object\n",
       "beer_servings                   float64\n",
       "spirit_servings                   int64\n",
       "wine_servings                     int64\n",
       "total_litres_of_pure_alcohol    float64\n",
       "continent                        object\n",
       "dtype: object"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# change the data type of an existing Series\n",
    "drinks[\"beer_servings\"] = drinks[\"beer_servings\"].astype(float)\n",
    "drinks.dtypes"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Use string methods\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>order_id</th>\n",
       "      <th>quantity</th>\n",
       "      <th>item_name</th>\n",
       "      <th>choice_description</th>\n",
       "      <th>item_price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Chips and Fresh Tomato Salsa</td>\n",
       "      <td>NaN</td>\n",
       "      <td>$2.39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Izze</td>\n",
       "      <td>[Clementine]</td>\n",
       "      <td>$3.39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Nantucket Nectar</td>\n",
       "      <td>[Apple]</td>\n",
       "      <td>$3.39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Chips and Tomatillo-Green Chili Salsa</td>\n",
       "      <td>NaN</td>\n",
       "      <td>$2.39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>Chicken Bowl</td>\n",
       "      <td>[Tomatillo-Red Chili Salsa (Hot), [Black Beans...</td>\n",
       "      <td>$16.98</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   order_id  quantity                              item_name  \\\n",
       "0         1         1           Chips and Fresh Tomato Salsa   \n",
       "1         1         1                                   Izze   \n",
       "2         1         1                       Nantucket Nectar   \n",
       "3         1         1  Chips and Tomatillo-Green Chili Salsa   \n",
       "4         2         2                           Chicken Bowl   \n",
       "\n",
       "                                  choice_description item_price  \n",
       "0                                                NaN      $2.39  \n",
       "1                                       [Clementine]      $3.39  \n",
       "2                                            [Apple]      $3.39  \n",
       "3                                                NaN      $2.39  \n",
       "4  [Tomatillo-Red Chili Salsa (Hot), [Black Beans...     $16.98  "
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "orders = pd.read_csv(\"data/chipotle.tsv\", sep=\"\\t\")\n",
    "orders.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0             CHIPS AND FRESH TOMATO SALSA\n",
       "1                                     IZZE\n",
       "2                         NANTUCKET NECTAR\n",
       "3    CHIPS AND TOMATILLO-GREEN CHILI SALSA\n",
       "4                             CHICKEN BOWL\n",
       "Name: item_name, dtype: object"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# string methods for pandas Series are accessed via 'str'\n",
    "orders.item_name.str.upper().head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>order_id</th>\n",
       "      <th>quantity</th>\n",
       "      <th>item_name</th>\n",
       "      <th>choice_description</th>\n",
       "      <th>item_price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>Chicken Bowl</td>\n",
       "      <td>[Tomatillo-Red Chili Salsa (Hot), [Black Beans...</td>\n",
       "      <td>$16.98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>Chicken Bowl</td>\n",
       "      <td>[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...</td>\n",
       "      <td>$10.98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>Chicken Crispy Tacos</td>\n",
       "      <td>[Roasted Chili Corn Salsa, [Fajita Vegetables,...</td>\n",
       "      <td>$8.75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>Chicken Soft Tacos</td>\n",
       "      <td>[Roasted Chili Corn Salsa, [Rice, Black Beans,...</td>\n",
       "      <td>$8.75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>Chicken Bowl</td>\n",
       "      <td>[Fresh Tomato Salsa, [Fajita Vegetables, Rice,...</td>\n",
       "      <td>$11.25</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    order_id  quantity             item_name  \\\n",
       "4          2         2          Chicken Bowl   \n",
       "5          3         1          Chicken Bowl   \n",
       "11         6         1  Chicken Crispy Tacos   \n",
       "12         6         1    Chicken Soft Tacos   \n",
       "13         7         1          Chicken Bowl   \n",
       "\n",
       "                                   choice_description item_price  \n",
       "4   [Tomatillo-Red Chili Salsa (Hot), [Black Beans...     $16.98  \n",
       "5   [Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...     $10.98  \n",
       "11  [Roasted Chili Corn Salsa, [Fajita Vegetables,...      $8.75  \n",
       "12  [Roasted Chili Corn Salsa, [Rice, Black Beans,...      $8.75  \n",
       "13  [Fresh Tomato Salsa, [Fajita Vegetables, Rice,...     $11.25  "
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use the boolean Series to filter the DataFrame\n",
    "orders[orders.item_name.str.contains(\"Chicken\")].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0                                                  NaN\n",
       "1                                           Clementine\n",
       "2                                                Apple\n",
       "3                                                  NaN\n",
       "4    Tomatillo-Red Chili Salsa (Hot), Black Beans, ...\n",
       "Name: choice_description, dtype: object"
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# string methods can be chained together\n",
    "orders.choice_description.str.replace(\"[\", \"\").str.replace(\"]\", \"\").head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0                                                  NaN\n",
       "1                                         [Clementine]\n",
       "2                                              [Apple]\n",
       "3                                                  NaN\n",
       "4    [Tomatillo-Red Chili Salsa (Hot), [Black Beans...\n",
       "Name: choice_description, dtype: object"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# many pandas string methods support regular expressions (regex)\n",
    "orders.choice_description.str.replace(r\"[\\[\\]]\", \"\").head()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dates & Times\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "      <td>6/1/1930 22:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "      <td>6/30/1930 20:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CO</td>\n",
       "      <td>2/15/1931 14:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Abilene</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>KS</td>\n",
       "      <td>6/1/1931 13:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>New York Worlds Fair</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>NY</td>\n",
       "      <td>4/18/1933 19:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   City Colors Reported Shape Reported State             Time\n",
       "0                Ithaca             NaN       TRIANGLE    NY   6/1/1930 22:00\n",
       "1           Willingboro             NaN          OTHER    NJ  6/30/1930 20:00\n",
       "2               Holyoke             NaN           OVAL    CO  2/15/1931 14:00\n",
       "3               Abilene             NaN           DISK    KS   6/1/1931 13:00\n",
       "4  New York Worlds Fair             NaN          LIGHT    NY  4/18/1933 19:00"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# read a dataset of UFO reports into a DataFrame\n",
    "ufo = pd.read_csv(\"data/ufo.csv\")\n",
    "ufo.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "City               object\n",
       "Colors Reported    object\n",
       "Shape Reported     object\n",
       "State              object\n",
       "Time               object\n",
       "dtype: object"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 'Time' is currently stored as a string\n",
    "ufo.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "      <td>1930-06-01 22:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "      <td>1930-06-30 20:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CO</td>\n",
       "      <td>1931-02-15 14:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Abilene</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>KS</td>\n",
       "      <td>1931-06-01 13:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>New York Worlds Fair</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>NY</td>\n",
       "      <td>1933-04-18 19:00:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   City Colors Reported Shape Reported State  \\\n",
       "0                Ithaca             NaN       TRIANGLE    NY   \n",
       "1           Willingboro             NaN          OTHER    NJ   \n",
       "2               Holyoke             NaN           OVAL    CO   \n",
       "3               Abilene             NaN           DISK    KS   \n",
       "4  New York Worlds Fair             NaN          LIGHT    NY   \n",
       "\n",
       "                 Time  \n",
       "0 1930-06-01 22:00:00  \n",
       "1 1930-06-30 20:00:00  \n",
       "2 1931-02-15 14:00:00  \n",
       "3 1931-06-01 13:00:00  \n",
       "4 1933-04-18 19:00:00  "
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# convert 'Time' to datetime format\n",
    "ufo[\"Time\"] = pd.to_datetime(ufo.Time)\n",
    "ufo.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "City                       object\n",
       "Colors Reported            object\n",
       "Shape Reported             object\n",
       "State                      object\n",
       "Time               datetime64[ns]\n",
       "dtype: object"
      ]
     },
     "execution_count": 98,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ufo.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Time</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1930-06-01 22:00:00</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1930-06-30 20:00:00</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1931-02-15 14:00:00</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1931-06-01 13:00:00</th>\n",
       "      <td>Abilene</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>KS</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1933-04-18 19:00:00</th>\n",
       "      <td>New York Worlds Fair</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>NY</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     City Colors Reported Shape Reported State\n",
       "Time                                                                          \n",
       "1930-06-01 22:00:00                Ithaca             NaN       TRIANGLE    NY\n",
       "1930-06-30 20:00:00           Willingboro             NaN          OTHER    NJ\n",
       "1931-02-15 14:00:00               Holyoke             NaN           OVAL    CO\n",
       "1931-06-01 13:00:00               Abilene             NaN           DISK    KS\n",
       "1933-04-18 19:00:00  New York Worlds Fair             NaN          LIGHT    NY"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ufo = ufo.set_index(\"Time\")\n",
    "ufo.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Time</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1943-06-01 15:00:00</th>\n",
       "      <td>Nebraska</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>NE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1947-07-15 15:00:00</th>\n",
       "      <td>Clintwood</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CYLINDER</td>\n",
       "      <td>VA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1947-07-15 15:00:00</th>\n",
       "      <td>Hazelton</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>ID</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1949-04-10 15:00:00</th>\n",
       "      <td>Seattle</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>WA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1949-06-30 15:00:00</th>\n",
       "      <td>Knoxville</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>TN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-10-12 15:00:00</th>\n",
       "      <td>Mountain City</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>TN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-10-28 15:00:00</th>\n",
       "      <td>Kintnersville</td>\n",
       "      <td>ORANGE</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>PA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-11-20 15:00:00</th>\n",
       "      <td>St. Joseph</td>\n",
       "      <td>NaN</td>\n",
       "      <td>EGG</td>\n",
       "      <td>MO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-12-10 15:00:00</th>\n",
       "      <td>Marshfield</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>WI</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-12-27 15:00:00</th>\n",
       "      <td>Kea'au</td>\n",
       "      <td>NaN</td>\n",
       "      <td>SPHERE</td>\n",
       "      <td>HI</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>254 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                              City Colors Reported Shape Reported State\n",
       "Time                                                                   \n",
       "1943-06-01 15:00:00       Nebraska             NaN           DISK    NE\n",
       "1947-07-15 15:00:00      Clintwood             NaN       CYLINDER    VA\n",
       "1947-07-15 15:00:00       Hazelton             NaN           DISK    ID\n",
       "1949-04-10 15:00:00        Seattle             NaN           DISK    WA\n",
       "1949-06-30 15:00:00      Knoxville             NaN          OTHER    TN\n",
       "...                            ...             ...            ...   ...\n",
       "2000-10-12 15:00:00  Mountain City             NaN           DISK    TN\n",
       "2000-10-28 15:00:00  Kintnersville          ORANGE          OTHER    PA\n",
       "2000-11-20 15:00:00     St. Joseph             NaN            EGG    MO\n",
       "2000-12-10 15:00:00     Marshfield             NaN          LIGHT    WI\n",
       "2000-12-27 15:00:00         Kea'au             NaN         SPHERE    HI\n",
       "\n",
       "[254 rows x 4 columns]"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ufo.at_time(\"15:00\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Time</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1941-07-02 11:30:00</th>\n",
       "      <td>Forest Home</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CIRCLE</td>\n",
       "      <td>CA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1943-10-15 11:00:00</th>\n",
       "      <td>Owensboro</td>\n",
       "      <td>NaN</td>\n",
       "      <td>RECTANGLE</td>\n",
       "      <td>KY</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1944-01-01 10:00:00</th>\n",
       "      <td>Wilderness</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>WV</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1944-01-01 12:00:00</th>\n",
       "      <td>San Diego</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CIGAR</td>\n",
       "      <td>CA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1944-01-01 12:00:00</th>\n",
       "      <td>Wilderness</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>WV</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-12-27 10:00:00</th>\n",
       "      <td>Santa Paula</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>CA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-12-28 11:00:00</th>\n",
       "      <td>Cerrilillo</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NM</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-12-28 12:00:00</th>\n",
       "      <td>Mansfield</td>\n",
       "      <td>NaN</td>\n",
       "      <td>FLASH</td>\n",
       "      <td>TX</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-12-30 10:00:00</th>\n",
       "      <td>Simi Valley</td>\n",
       "      <td>NaN</td>\n",
       "      <td>FORMATION</td>\n",
       "      <td>CA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-12-31 11:45:00</th>\n",
       "      <td>Chicago</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>IL</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>781 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                            City Colors Reported Shape Reported State\n",
       "Time                                                                 \n",
       "1941-07-02 11:30:00  Forest Home             NaN         CIRCLE    CA\n",
       "1943-10-15 11:00:00    Owensboro             NaN      RECTANGLE    KY\n",
       "1944-01-01 10:00:00   Wilderness             NaN           DISK    WV\n",
       "1944-01-01 12:00:00    San Diego             NaN          CIGAR    CA\n",
       "1944-01-01 12:00:00   Wilderness             NaN           DISK    WV\n",
       "...                          ...             ...            ...   ...\n",
       "2000-12-27 10:00:00  Santa Paula             NaN          LIGHT    CA\n",
       "2000-12-28 11:00:00   Cerrilillo             NaN            NaN    NM\n",
       "2000-12-28 12:00:00    Mansfield             NaN          FLASH    TX\n",
       "2000-12-30 10:00:00  Simi Valley             NaN      FORMATION    CA\n",
       "2000-12-31 11:45:00      Chicago             NaN           DISK    IL\n",
       "\n",
       "[781 rows x 4 columns]"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ufo.between_time(\"09:45\", \"12:00\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Category data\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country</th>\n",
       "      <th>beer_servings</th>\n",
       "      <th>spirit_servings</th>\n",
       "      <th>wine_servings</th>\n",
       "      <th>total_litres_of_pure_alcohol</th>\n",
       "      <th>continent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Asia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>89</td>\n",
       "      <td>132</td>\n",
       "      <td>54</td>\n",
       "      <td>4.9</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>25</td>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>0.7</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>245</td>\n",
       "      <td>138</td>\n",
       "      <td>312</td>\n",
       "      <td>12.4</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>217</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>5.9</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       country  beer_servings  spirit_servings  wine_servings  \\\n",
       "0  Afghanistan              0                0              0   \n",
       "1      Albania             89              132             54   \n",
       "2      Algeria             25                0             14   \n",
       "3      Andorra            245              138            312   \n",
       "4       Angola            217               57             45   \n",
       "\n",
       "   total_litres_of_pure_alcohol continent  \n",
       "0                           0.0      Asia  \n",
       "1                           4.9    Europe  \n",
       "2                           0.7    Africa  \n",
       "3                          12.4    Europe  \n",
       "4                           5.9    Africa  "
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drinks = pd.read_csv(\"data/drinks.csv\")\n",
    "drinks.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "country                         string[python]\n",
       "beer_servings                            Int64\n",
       "spirit_servings                          Int64\n",
       "wine_servings                            Int64\n",
       "total_litres_of_pure_alcohol           Float64\n",
       "continent                       string[python]\n",
       "dtype: object"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drinks = drinks.convert_dtypes()\n",
    "drinks.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index                             132\n",
       "country                         11044\n",
       "beer_servings                    1737\n",
       "spirit_servings                  1737\n",
       "wine_servings                    1737\n",
       "total_litres_of_pure_alcohol     1737\n",
       "continent                       10788\n",
       "dtype: int64"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# calculate the memory usage for each Series (in bytes)\n",
    "drinks.memory_usage(deep=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "country                         string[python]\n",
       "beer_servings                            Int64\n",
       "spirit_servings                          Int64\n",
       "wine_servings                            Int64\n",
       "total_litres_of_pure_alcohol           Float64\n",
       "continent                             category\n",
       "dtype: object"
      ]
     },
     "execution_count": 105,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use the 'category' data type to store the 'continent' strings as integers\n",
    "drinks[\"continent\"] = drinks[\"continent\"].astype(\"category\")\n",
    "drinks.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1\n",
       "1    2\n",
       "2    0\n",
       "3    2\n",
       "4    0\n",
       "dtype: int8"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# strings are now encoded (0 means 'Africa', 1 means 'Asia', 2 means 'Europe', etc.)\n",
    "drinks[\"continent\"].cat.codes.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index                             132\n",
       "country                         11044\n",
       "beer_servings                    1737\n",
       "spirit_servings                  1737\n",
       "wine_servings                    1737\n",
       "total_litres_of_pure_alcohol     1737\n",
       "continent                         708\n",
       "dtype: int64"
      ]
     },
     "execution_count": 107,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# memory usage has been drastically reduced\n",
    "drinks.memory_usage(deep=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index                             132\n",
       "country                         15598\n",
       "beer_servings                    1737\n",
       "spirit_servings                  1737\n",
       "wine_servings                    1737\n",
       "total_litres_of_pure_alcohol     1737\n",
       "continent                         708\n",
       "dtype: int64"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# repeat this process for the 'country' Series\n",
    "drinks[\"country\"] = drinks[\"country\"].astype(\"category\")\n",
    "drinks.memory_usage(deep=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',\n",
       "       'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',\n",
       "       ...\n",
       "       'United Arab Emirates', 'United Kingdom', 'Uruguay', 'Uzbekistan',\n",
       "       'Vanuatu', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],\n",
       "      dtype='string', length=193)"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# memory usage increased because we created 193 categories\n",
    "drinks[\"country\"].cat.categories"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dummy variables\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked  \n",
       "0      0         A/5 21171   7.2500   NaN        S  \n",
       "1      0          PC 17599  71.2833   C85        C  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3      0            113803  53.1000  C123        S  \n",
       "4      0            373450   8.0500   NaN        S  "
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = pd.read_csv(\"data/titanic_train.csv\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>female</th>\n",
       "      <th>male</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   female   male\n",
       "0   False   True\n",
       "1    True  False\n",
       "2    True  False\n",
       "3    True  False\n",
       "4   False   True"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use 'get_dummies' to create one column for every possible value\n",
    "pd.get_dummies(train.Sex).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sex_male</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Sex_male\n",
       "0      True\n",
       "1     False\n",
       "2     False\n",
       "3     False\n",
       "4      True"
      ]
     },
     "execution_count": 112,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# drop the first dummy variable ('female') using the 'iloc' method\n",
    "# add a prefix to identify the source of the dummy variables\n",
    "pd.get_dummies(train.Sex, prefix=\"Sex\").iloc[:, 1:].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Embarked_C  Embarked_Q  Embarked_S\n",
       "0       False       False        True\n",
       "1        True       False       False\n",
       "2       False       False        True\n",
       "3       False       False        True\n",
       "4       False       False        True\n",
       "5       False        True       False\n",
       "6       False       False        True\n",
       "7       False       False        True\n",
       "8       False       False        True\n",
       "9        True       False       False"
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use 'get_dummies' with a feature that has 3 possible values\n",
    "pd.get_dummies(train.Embarked, prefix=\"Embarked\").head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Embarked_Q  Embarked_S\n",
       "0       False        True\n",
       "1       False       False\n",
       "2       False        True\n",
       "3       False        True\n",
       "4       False        True\n",
       "5        True       False\n",
       "6       False        True\n",
       "7       False        True\n",
       "8       False        True\n",
       "9       False       False"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# drop the first dummy variable ('C')\n",
    "pd.get_dummies(train.Embarked, prefix=\"Embarked\").iloc[:, 1:].head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked  Embarked_Q  Embarked_S  \n",
       "0      0         A/5 21171   7.2500   NaN        S       False        True  \n",
       "1      0          PC 17599  71.2833   C85        C       False       False  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S       False        True  \n",
       "3      0            113803  53.1000  C123        S       False        True  \n",
       "4      0            373450   8.0500   NaN        S       False        True  "
      ]
     },
     "execution_count": 115,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# save the DataFrame of dummy variables and concatenate them to the original DataFrame\n",
    "embarked_dummies = pd.get_dummies(train.Embarked, prefix=\"Embarked\").iloc[:, 1:]\n",
    "train = pd.concat([train, embarked_dummies], axis=1)\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked  \n",
       "0      0         A/5 21171   7.2500   NaN        S  \n",
       "1      0          PC 17599  71.2833   C85        C  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3      0            113803  53.1000  C123        S  \n",
       "4      0            373450   8.0500   NaN        S  "
      ]
     },
     "execution_count": 116,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = pd.read_csv(\"data/titanic_train.csv\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name   Age  SibSp  Parch  \\\n",
       "0                            Braund, Mr. Owen Harris  22.0      1      0   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  38.0      1      0   \n",
       "2                             Heikkinen, Miss. Laina  26.0      0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  35.0      1      0   \n",
       "4                           Allen, Mr. William Henry  35.0      0      0   \n",
       "\n",
       "             Ticket     Fare Cabin  Sex_female  Sex_male  Embarked_C  \\\n",
       "0         A/5 21171   7.2500   NaN       False      True       False   \n",
       "1          PC 17599  71.2833   C85        True     False        True   \n",
       "2  STON/O2. 3101282   7.9250   NaN        True     False       False   \n",
       "3            113803  53.1000  C123        True     False       False   \n",
       "4            373450   8.0500   NaN       False      True       False   \n",
       "\n",
       "   Embarked_Q  Embarked_S  \n",
       "0       False        True  \n",
       "1       False       False  \n",
       "2       False        True  \n",
       "3       False        True  \n",
       "4       False        True  "
      ]
     },
     "execution_count": 117,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# pass the DataFrame to 'get_dummies' and specify which columns to dummy (it drops the original columns)\n",
    "pd.get_dummies(train, columns=[\"Sex\", \"Embarked\"]).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name   Age  SibSp  Parch  \\\n",
       "0                            Braund, Mr. Owen Harris  22.0      1      0   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  38.0      1      0   \n",
       "2                             Heikkinen, Miss. Laina  26.0      0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  35.0      1      0   \n",
       "4                           Allen, Mr. William Henry  35.0      0      0   \n",
       "\n",
       "             Ticket     Fare Cabin  Sex_male  Embarked_Q  Embarked_S  \n",
       "0         A/5 21171   7.2500   NaN      True       False        True  \n",
       "1          PC 17599  71.2833   C85     False       False       False  \n",
       "2  STON/O2. 3101282   7.9250   NaN     False       False        True  \n",
       "3            113803  53.1000  C123     False       False        True  \n",
       "4            373450   8.0500   NaN      True       False        True  "
      ]
     },
     "execution_count": 118,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use the 'drop_first' parameter (new in pandas 0.18) to drop the first dummy variable for each feature\n",
    "pd.get_dummies(train, columns=[\"Sex\", \"Embarked\"], drop_first=True).head()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Cardinal data\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>quality</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>100</td>\n",
       "      <td>good</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101</td>\n",
       "      <td>very good</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>102</td>\n",
       "      <td>good</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>103</td>\n",
       "      <td>excellent</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    ID    quality\n",
       "0  100       good\n",
       "1  101  very good\n",
       "2  102       good\n",
       "3  103  excellent"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a small DataFrame from a dictionary\n",
    "small = pd.DataFrame(\n",
    "    {\"ID\": [100, 101, 102, 103], \"quality\": [\"good\", \"very good\", \"good\", \"excellent\"]}\n",
    ")\n",
    "small.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>quality</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>103</td>\n",
       "      <td>excellent</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>100</td>\n",
       "      <td>good</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>102</td>\n",
       "      <td>good</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101</td>\n",
       "      <td>very good</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    ID    quality\n",
       "3  103  excellent\n",
       "0  100       good\n",
       "2  102       good\n",
       "1  101  very good"
      ]
     },
     "execution_count": 120,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sort the DataFrame by the 'quality' Series (alphabetical order)\n",
    "small.sort_values(\"quality\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas.api.types import CategoricalDtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         good\n",
       "1    very good\n",
       "2         good\n",
       "3    excellent\n",
       "Name: quality, dtype: category\n",
       "Categories (3, object): ['good' < 'very good' < 'excellent']"
      ]
     },
     "execution_count": 122,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# define a logical ordering for the categories\n",
    "cats = [\"good\", \"very good\", \"excellent\"]\n",
    "cat_type = CategoricalDtype(categories=cats, ordered=True)\n",
    "small[\"quality\"] = small.quality.astype(cat_type)\n",
    "small.quality"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>quality</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>100</td>\n",
       "      <td>good</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>102</td>\n",
       "      <td>good</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101</td>\n",
       "      <td>very good</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>103</td>\n",
       "      <td>excellent</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    ID    quality\n",
       "0  100       good\n",
       "2  102       good\n",
       "1  101  very good\n",
       "3  103  excellent"
      ]
     },
     "execution_count": 123,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sort the DataFrame by the 'quality' Series (logical order)\n",
    "small.sort_values(\"quality\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>quality</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101</td>\n",
       "      <td>very good</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>103</td>\n",
       "      <td>excellent</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    ID    quality\n",
       "1  101  very good\n",
       "3  103  excellent"
      ]
     },
     "execution_count": 124,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# comparison operators work with ordered categories\n",
    "small.loc[small.quality > \"good\", :]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sample rows\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Time</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1952-09-15 14:00:00</th>\n",
       "      <td>Norridgewock</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>ME</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1998-10-01 21:15:00</th>\n",
       "      <td>Ipava</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>IL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-11-13 22:00:00</th>\n",
       "      <td>Ellinwood</td>\n",
       "      <td>NaN</td>\n",
       "      <td>FIREBALL</td>\n",
       "      <td>KS</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             City Colors Reported Shape Reported State\n",
       "Time                                                                  \n",
       "1952-09-15 14:00:00  Norridgewock             NaN           DISK    ME\n",
       "1998-10-01 21:15:00         Ipava             NaN       TRIANGLE    IL\n",
       "2000-11-13 22:00:00     Ellinwood             NaN       FIREBALL    KS"
      ]
     },
     "execution_count": 125,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use the 'random_state' parameter for reproducibility\n",
    "ufo.sample(n=3, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sample 75% of the DataFrame's rows without replacement\n",
    "train = ufo.sample(frac=0.75, random_state=99)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Time</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1933-04-18 19:00:00</th>\n",
       "      <td>New York Worlds Fair</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>NY</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1934-09-15 15:30:00</th>\n",
       "      <td>Valley City</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>ND</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1936-10-15 17:00:00</th>\n",
       "      <td>Eklutna</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CIGAR</td>\n",
       "      <td>AK</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1939-06-01 20:00:00</th>\n",
       "      <td>Waterloo</td>\n",
       "      <td>NaN</td>\n",
       "      <td>FIREBALL</td>\n",
       "      <td>AL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1939-07-07 02:00:00</th>\n",
       "      <td>Keokuk</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>IA</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     City Colors Reported Shape Reported State\n",
       "Time                                                                          \n",
       "1933-04-18 19:00:00  New York Worlds Fair             NaN          LIGHT    NY\n",
       "1934-09-15 15:30:00           Valley City             NaN           DISK    ND\n",
       "1936-10-15 17:00:00               Eklutna             NaN          CIGAR    AK\n",
       "1939-06-01 20:00:00              Waterloo             NaN       FIREBALL    AL\n",
       "1939-07-07 02:00:00                Keokuk             NaN           OVAL    IA"
      ]
     },
     "execution_count": 127,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# store the remaining 25% of the rows in another DataFrame\n",
    "test = ufo.loc[~ufo.index.isin(train.index), :]\n",
    "test.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dox",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}