{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# 清洗和预处理" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "tags": [] }, "outputs": [], "source": [ "import pandas as pd" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Explore\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrybeer_servingsspirit_servingswine_servingstotal_litres_of_pure_alcoholcontinent
0Afghanistan0000.0Asia
1Albania89132544.9Europe
2Algeria250140.7Africa
3Andorra24513831212.4Europe
4Angola21757455.9Africa
\n", "
" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0 0 0 \n", "1 Albania 89 132 54 \n", "2 Algeria 25 0 14 \n", "3 Andorra 245 138 312 \n", "4 Angola 217 57 45 \n", "\n", " total_litres_of_pure_alcohol continent \n", "0 0.0 Asia \n", "1 4.9 Europe \n", "2 0.7 Africa \n", "3 12.4 Europe \n", "4 5.9 Africa " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drinks = pd.read_csv(\"data/drinks.csv\")\n", "drinks.head()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Max & Min\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "117 376\n", "45 361\n", "62 347\n", "65 346\n", "98 343\n", "135 343\n", "188 333\n", "81 313\n", "129 306\n", "140 297\n", "Name: beer_servings, dtype: int64" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drinks[\"beer_servings\"].nlargest(10)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "117" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drinks[\"beer_servings\"].idxmax()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Count\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "continent\n", "Africa 53\n", "Europe 45\n", "Asia 44\n", "North America 23\n", "Oceania 16\n", "South America 12\n", "Name: count, dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drinks[\"continent\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "continent\n", "Africa 0.274611\n", "Europe 0.233161\n", "Asia 0.227979\n", "North America 0.119171\n", "Oceania 0.082902\n", "South America 0.062176\n", "Name: proportion, dtype: float64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drinks[\"continent\"].value_counts(normalize=True)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Africa', 'Europe', 'Asia', 'North America', 'Oceania',\n", " 'South America'],\n", " dtype='object', name='continent')" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# access the Series index\n", "drinks[\"continent\"].value_counts().index" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\n", "[np.int64(53), np.int64(45), np.int64(44), np.int64(23), np.int64(16),\n", " np.int64(12)]\n", "Length: 6, dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# access the Series values\n", "drinks[\"continent\"].value_counts().array" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "continent\n", "South America 12\n", "Oceania 16\n", "North America 23\n", "Asia 44\n", "Europe 45\n", "Africa 53\n", "Name: count, dtype: int64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# any Series can be sorted by its values\n", "drinks.continent.value_counts().sort_values()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "continent\n", "Africa 53\n", "Asia 44\n", "Europe 45\n", "North America 23\n", "Oceania 16\n", "South America 12\n", "Name: count, dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# any Series can also be sorted by its index\n", "drinks[\"continent\"].value_counts().sort_index()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Asia', 'Europe', 'Africa', 'North America', 'South America',\n", " 'Oceania'], dtype=object)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drinks[\"continent\"].unique()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "6" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drinks[\"continent\"].nunique()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Sort\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
star_ratingtitlecontent_ratinggenredurationactors_list
09.3The Shawshank RedemptionRCrime142[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...
19.2The GodfatherRCrime175[u'Marlon Brando', u'Al Pacino', u'James Caan']
29.1The Godfather: Part IIRCrime200[u'Al Pacino', u'Robert De Niro', u'Robert Duv...
39.0The Dark KnightPG-13Action152[u'Christian Bale', u'Heath Ledger', u'Aaron E...
48.9Pulp FictionRCrime154[u'John Travolta', u'Uma Thurman', u'Samuel L....
\n", "
" ], "text/plain": [ " star_rating title content_rating genre duration \\\n", "0 9.3 The Shawshank Redemption R Crime 142 \n", "1 9.2 The Godfather R Crime 175 \n", "2 9.1 The Godfather: Part II R Crime 200 \n", "3 9.0 The Dark Knight PG-13 Action 152 \n", "4 8.9 Pulp Fiction R Crime 154 \n", "\n", " actors_list \n", "0 [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... \n", "1 [u'Marlon Brando', u'Al Pacino', u'James Caan'] \n", "2 [u'Al Pacino', u'Robert De Niro', u'Robert Duv... \n", "3 [u'Christian Bale', u'Heath Ledger', u'Aaron E... \n", "4 [u'John Travolta', u'Uma Thurman', u'Samuel L.... " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies = pd.read_csv(\"data/imdb_1000.csv\")\n", "movies.head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "542 (500) Days of Summer\n", "5 12 Angry Men\n", "201 12 Years a Slave\n", "698 127 Hours\n", "110 2001: A Space Odyssey\n", "Name: title, dtype: object" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sort the 'title' Series in ascending order (returns a Series)\n", "movies.title.sort_values().head()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "864 [Rec]\n", "526 Zulu\n", "615 Zombieland\n", "677 Zodiac\n", "955 Zero Dark Thirty\n", "Name: title, dtype: object" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sort in descending order instead\n", "movies.title.sort_values(ascending=False).head()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
star_ratingtitlecontent_ratinggenredurationactors_list
5427.8(500) Days of SummerPG-13Comedy95[u'Zooey Deschanel', u'Joseph Gordon-Levitt', ...
58.912 Angry MenNOT RATEDDrama96[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals...
2018.112 Years a SlaveRBiography134[u'Chiwetel Ejiofor', u'Michael Kenneth Willia...
6987.6127 HoursRAdventure94[u'James Franco', u'Amber Tamblyn', u'Kate Mara']
1108.32001: A Space OdysseyGMystery160[u'Keir Dullea', u'Gary Lockwood', u'William S...
\n", "
" ], "text/plain": [ " star_rating title content_rating genre duration \\\n", "542 7.8 (500) Days of Summer PG-13 Comedy 95 \n", "5 8.9 12 Angry Men NOT RATED Drama 96 \n", "201 8.1 12 Years a Slave R Biography 134 \n", "698 7.6 127 Hours R Adventure 94 \n", "110 8.3 2001: A Space Odyssey G Mystery 160 \n", "\n", " actors_list \n", "542 [u'Zooey Deschanel', u'Joseph Gordon-Levitt', ... \n", "5 [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals... \n", "201 [u'Chiwetel Ejiofor', u'Michael Kenneth Willia... \n", "698 [u'James Franco', u'Amber Tamblyn', u'Kate Mara'] \n", "110 [u'Keir Dullea', u'Gary Lockwood', u'William S... " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sort the entire DataFrame by the 'title' Series (returns a DataFrame)\n", "movies.sort_values(\"title\").head()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
star_ratingtitlecontent_ratinggenredurationactors_list
8647.5[Rec]RHorror78[u'Manuela Velasco', u'Ferran Terraza', u'Jorg...
5267.8ZuluUNRATEDDrama138[u'Stanley Baker', u'Jack Hawkins', u'Ulla Jac...
6157.7ZombielandRComedy88[u'Jesse Eisenberg', u'Emma Stone', u'Woody Ha...
6777.7ZodiacRCrime157[u'Jake Gyllenhaal', u'Robert Downey Jr.', u'M...
9557.4Zero Dark ThirtyRDrama157[u'Jessica Chastain', u'Joel Edgerton', u'Chri...
\n", "
" ], "text/plain": [ " star_rating title content_rating genre duration \\\n", "864 7.5 [Rec] R Horror 78 \n", "526 7.8 Zulu UNRATED Drama 138 \n", "615 7.7 Zombieland R Comedy 88 \n", "677 7.7 Zodiac R Crime 157 \n", "955 7.4 Zero Dark Thirty R Drama 157 \n", "\n", " actors_list \n", "864 [u'Manuela Velasco', u'Ferran Terraza', u'Jorg... \n", "526 [u'Stanley Baker', u'Jack Hawkins', u'Ulla Jac... \n", "615 [u'Jesse Eisenberg', u'Emma Stone', u'Woody Ha... \n", "677 [u'Jake Gyllenhaal', u'Robert Downey Jr.', u'M... \n", "955 [u'Jessica Chastain', u'Joel Edgerton', u'Chri... " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sort in descending order instead\n", "movies.sort_values(\"title\", ascending=False).head()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
star_ratingtitlecontent_ratinggenredurationactors_list
7137.6The Jungle BookAPPROVEDAnimation78[u'Phil Harris', u'Sebastian Cabot', u'Louis P...
5137.8Invasion of the Body SnatchersAPPROVEDHorror80[u'Kevin McCarthy', u'Dana Wynter', u'Larry Ga...
2728.1The KillingAPPROVEDCrime85[u'Sterling Hayden', u'Coleen Gray', u'Vince E...
7037.6DraculaAPPROVEDHorror85[u'Bela Lugosi', u'Helen Chandler', u'David Ma...
6127.7A Hard Day's NightAPPROVEDComedy87[u'John Lennon', u'Paul McCartney', u'George H...
\n", "
" ], "text/plain": [ " star_rating title content_rating genre \\\n", "713 7.6 The Jungle Book APPROVED Animation \n", "513 7.8 Invasion of the Body Snatchers APPROVED Horror \n", "272 8.1 The Killing APPROVED Crime \n", "703 7.6 Dracula APPROVED Horror \n", "612 7.7 A Hard Day's Night APPROVED Comedy \n", "\n", " duration actors_list \n", "713 78 [u'Phil Harris', u'Sebastian Cabot', u'Louis P... \n", "513 80 [u'Kevin McCarthy', u'Dana Wynter', u'Larry Ga... \n", "272 85 [u'Sterling Hayden', u'Coleen Gray', u'Vince E... \n", "703 85 [u'Bela Lugosi', u'Helen Chandler', u'David Ma... \n", "612 87 [u'John Lennon', u'Paul McCartney', u'George H... " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sort the DataFrame first by 'content_rating', then by 'duration'\n", "movies.sort_values([\"content_rating\", \"duration\"]).head()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Correlation\n" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
continentAfricaAsiaEuropeNorth AmericaOceaniaSouth America
country
Afghanistan010000
Albania001000
Algeria100000
Andorra001000
Angola100000
.....................
Venezuela000001
Vietnam010000
Yemen010000
Zambia100000
Zimbabwe100000
\n", "

193 rows × 6 columns

\n", "
" ], "text/plain": [ "continent Africa Asia Europe North America Oceania South America\n", "country \n", "Afghanistan 0 1 0 0 0 0\n", "Albania 0 0 1 0 0 0\n", "Algeria 1 0 0 0 0 0\n", "Andorra 0 0 1 0 0 0\n", "Angola 1 0 0 0 0 0\n", "... ... ... ... ... ... ...\n", "Venezuela 0 0 0 0 0 1\n", "Vietnam 0 1 0 0 0 0\n", "Yemen 0 1 0 0 0 0\n", "Zambia 1 0 0 0 0 0\n", "Zimbabwe 1 0 0 0 0 0\n", "\n", "[193 rows x 6 columns]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.crosstab(drinks[\"country\"], drinks[\"continent\"])" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Transform\n" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Group\n" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrybeer_servingsspirit_servingswine_servingstotal_litres_of_pure_alcoholcontinent
0Afghanistan0000.0Asia
1Albania89132544.9Europe
2Algeria250140.7Africa
3Andorra24513831212.4Europe
4Angola21757455.9Africa
\n", "
" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0 0 0 \n", "1 Albania 89 132 54 \n", "2 Algeria 25 0 14 \n", "3 Andorra 245 138 312 \n", "4 Angola 217 57 45 \n", "\n", " total_litres_of_pure_alcohol continent \n", "0 0.0 Asia \n", "1 4.9 Europe \n", "2 0.7 Africa \n", "3 12.4 Europe \n", "4 5.9 Africa " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drinks = pd.read_csv(\"data/drinks.csv\")\n", "drinks.head()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrybeer_servingsspirit_servingswine_servingstotal_litres_of_pure_alcoholcontinent
15Belarus1423734214.4Europe
27Burundi88006.3Africa
30Cambodia576512.2Asia
41Costa Rica14987114.4North America
52Ecuador1627434.2South America
110Micronesia6250182.3Oceania
\n", "
" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "15 Belarus 142 373 42 \n", "27 Burundi 88 0 0 \n", "30 Cambodia 57 65 1 \n", "41 Costa Rica 149 87 11 \n", "52 Ecuador 162 74 3 \n", "110 Micronesia 62 50 18 \n", "\n", " total_litres_of_pure_alcohol continent \n", "15 14.4 Europe \n", "27 6.3 Africa \n", "30 2.2 Asia \n", "41 4.4 North America \n", "52 4.2 South America \n", "110 2.3 Oceania " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drinks.groupby(\"continent\").nth(5)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "continent\n", "Africa 61.471698\n", "Asia 37.045455\n", "Europe 193.777778\n", "North America 145.434783\n", "Oceania 89.687500\n", "South America 175.083333\n", "Name: beer_servings, dtype: float64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# calculate the mean beer servings for each continent\n", "drinks.groupby(\"continent\")[\"beer_servings\"].mean()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "continent\n", "Africa 376\n", "Asia 247\n", "Europe 361\n", "North America 285\n", "Oceania 306\n", "South America 333\n", "Name: beer_servings, dtype: int64" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# other aggregation functions (such as 'max') can also be used with groupby\n", "drinks.groupby(\"continent\")[\"beer_servings\"].max()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countmeanminmax
continent
Africa5361.4716980376
Asia4437.0454550247
Europe45193.7777780361
North America23145.4347831285
Oceania1689.6875000306
South America12175.08333393333
\n", "
" ], "text/plain": [ " count mean min max\n", "continent \n", "Africa 53 61.471698 0 376\n", "Asia 44 37.045455 0 247\n", "Europe 45 193.777778 0 361\n", "North America 23 145.434783 1 285\n", "Oceania 16 89.687500 0 306\n", "South America 12 175.083333 93 333" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# multiple aggregation functions can be applied simultaneously\n", "drinks.groupby(\"continent\")[\"beer_servings\"].agg([\"count\", \"mean\", \"min\", \"max\"])" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Apply\n" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrybeer_servingsspirit_servingswine_servingstotal_litres_of_pure_alcoholcontinentaverage_wine_servings
0Afghanistan0000.0Asia9.068182
1Albania89132544.9Europe142.222222
2Algeria250140.7Africa16.264151
3Andorra24513831212.4Europe142.222222
4Angola21757455.9Africa16.264151
\n", "
" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0 0 0 \n", "1 Albania 89 132 54 \n", "2 Algeria 25 0 14 \n", "3 Andorra 245 138 312 \n", "4 Angola 217 57 45 \n", "\n", " total_litres_of_pure_alcohol continent average_wine_servings \n", "0 0.0 Asia 9.068182 \n", "1 4.9 Europe 142.222222 \n", "2 0.7 Africa 16.264151 \n", "3 12.4 Europe 142.222222 \n", "4 5.9 Africa 16.264151 " ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drinks[\"average_wine_servings\"] = drinks.groupby(\"continent\")[\n", " \"wine_servings\"\n", "].transform(\"mean\")\n", "drinks.head()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrybeer_servingsspirit_servingswine_servingstotal_litres_of_pure_alcoholcontinentaverage_wine_servings
0Afghanistan0000.0Asia0.090682
1Albania89132544.9Europe1.422222
2Algeria250140.7Africa0.162642
3Andorra24513831212.4Europe1.422222
4Angola21757455.9Africa0.162642
\n", "
" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0 0 0 \n", "1 Albania 89 132 54 \n", "2 Algeria 25 0 14 \n", "3 Andorra 245 138 312 \n", "4 Angola 217 57 45 \n", "\n", " total_litres_of_pure_alcohol continent average_wine_servings \n", "0 0.0 Asia 0.090682 \n", "1 4.9 Europe 1.422222 \n", "2 0.7 Africa 0.162642 \n", "3 12.4 Europe 1.422222 \n", "4 5.9 Africa 0.162642 " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drinks[\"average_wine_servings\"] = drinks[\"average_wine_servings\"].apply(\n", " lambda x: x / 100\n", ")\n", "drinks.head()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
34.63.11.50.2setosa
45.03.61.40.2setosa
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species\n", "0 5.1 3.5 1.4 0.2 setosa\n", "1 4.9 3.0 1.4 0.2 setosa\n", "2 4.7 3.2 1.3 0.2 setosa\n", "3 4.6 3.1 1.5 0.2 setosa\n", "4 5.0 3.6 1.4 0.2 setosa" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris = pd.read_csv(\"data/iris.csv\")\n", "iris.head()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "species\n", "setosa 50\n", "versicolor 50\n", "virginica 50\n", "dtype: int64" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.groupby(\"species\").size()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_width
species
setosa14152443
versicolor50858370
virginica131117118100
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width\n", "species \n", "setosa 14 15 24 43\n", "versicolor 50 85 83 70\n", "virginica 131 117 118 100" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.groupby(\"species\").idxmax()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_minsepal_maxpetal_meanpetal_std
species
setosa4.35.81.4620.173664
versicolor4.97.04.2600.469911
virginica4.97.95.5520.551895
\n", "
" ], "text/plain": [ " sepal_min sepal_max petal_mean petal_std\n", "species \n", "setosa 4.3 5.8 1.462 0.173664\n", "versicolor 4.9 7.0 4.260 0.469911\n", "virginica 4.9 7.9 5.552 0.551895" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.groupby(\"species\").agg(\n", " sepal_min=(\"sepal_length\", \"min\"),\n", " sepal_max=(\"sepal_length\", \"max\"),\n", " petal_mean=(\"petal_length\", \"mean\"),\n", " petal_std=(\"petal_length\", \"std\"),\n", ")" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Apply a function\n" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S " ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train = pd.read_csv(\"data/titanic_train.csv\")\n", "train.head()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SexSex_num
0male1
1female0
2female0
3female0
4male1
\n", "
" ], "text/plain": [ " Sex Sex_num\n", "0 male 1\n", "1 female 0\n", "2 female 0\n", "3 female 0\n", "4 male 1" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# map 'female' to 0 and 'male' to 1\n", "train[\"Sex_num\"] = train.Sex.map({\"female\": 0, \"male\": 1})\n", "train.loc[0:4, [\"Sex\", \"Sex_num\"]]" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameName_length
0Braund, Mr. Owen Harris23
1Cumings, Mrs. John Bradley (Florence Briggs Th...51
2Heikkinen, Miss. Laina22
3Futrelle, Mrs. Jacques Heath (Lily May Peel)44
4Allen, Mr. William Henry24
\n", "
" ], "text/plain": [ " Name Name_length\n", "0 Braund, Mr. Owen Harris 23\n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... 51\n", "2 Heikkinen, Miss. Laina 22\n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 44\n", "4 Allen, Mr. William Henry 24" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# calculate the length of each string in the 'Name' Series\n", "train[\"Name_length\"] = train[\"Name\"].apply(len)\n", "train.loc[0:4, [\"Name\", \"Name_length\"]]" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 [Braund, Mr. Owen Harris]\n", "1 [Cumings, Mrs. John Bradley (Florence Briggs ...\n", "2 [Heikkinen, Miss. Laina]\n", "3 [Futrelle, Mrs. Jacques Heath (Lily May Peel)]\n", "4 [Allen, Mr. William Henry]\n", "Name: Name, dtype: object" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use a string method to split the 'Name' Series at commas (returns a Series of lists)\n", "train[\"Name\"].str.split(\",\").head()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 Braund\n", "1 Cumings\n", "2 Heikkinen\n", "3 Futrelle\n", "4 Allen\n", "Name: Name, dtype: object" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use a lambda function\n", "train[\"Name\"].str.split(\",\").apply(lambda x: x[0]).head()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### MultiIndex\n" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DateCloseVolumeSymbol
02016-10-0331.5014070500CSCO
12016-10-03112.5221701800AAPL
22016-10-0357.4219189500MSFT
32016-10-04113.0029736800AAPL
42016-10-0457.2420085900MSFT
52016-10-0431.3518460400CSCO
62016-10-0557.6416726400MSFT
72016-10-0531.5911808600CSCO
82016-10-05113.0521453100AAPL
\n", "
" ], "text/plain": [ " Date Close Volume Symbol\n", "0 2016-10-03 31.50 14070500 CSCO\n", "1 2016-10-03 112.52 21701800 AAPL\n", "2 2016-10-03 57.42 19189500 MSFT\n", "3 2016-10-04 113.00 29736800 AAPL\n", "4 2016-10-04 57.24 20085900 MSFT\n", "5 2016-10-04 31.35 18460400 CSCO\n", "6 2016-10-05 57.64 16726400 MSFT\n", "7 2016-10-05 31.59 11808600 CSCO\n", "8 2016-10-05 113.05 21453100 AAPL" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks = pd.read_csv(\"data/stocks.csv\")\n", "stocks" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RangeIndex(start=0, stop=9, step=1)" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.index" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Symbol Date \n", "AAPL 2016-10-03 112.52\n", " 2016-10-04 113.00\n", " 2016-10-05 113.05\n", "CSCO 2016-10-03 31.50\n", " 2016-10-04 31.35\n", " 2016-10-05 31.59\n", "MSFT 2016-10-03 57.42\n", " 2016-10-04 57.24\n", " 2016-10-05 57.64\n", "Name: Close, dtype: float64" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ser = stocks.groupby([\"Symbol\", \"Date\"])[\"Close\"].mean()\n", "ser" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MultiIndex([('AAPL', '2016-10-03'),\n", " ('AAPL', '2016-10-04'),\n", " ('AAPL', '2016-10-05'),\n", " ('CSCO', '2016-10-03'),\n", " ('CSCO', '2016-10-04'),\n", " ('CSCO', '2016-10-05'),\n", " ('MSFT', '2016-10-03'),\n", " ('MSFT', '2016-10-04'),\n", " ('MSFT', '2016-10-05')],\n", " names=['Symbol', 'Date'])" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ser.index" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Symbol Date \n", "AAPL 2016-10-03 112.52\n", " 2016-10-04 113.00\n", " 2016-10-05 113.05\n", "CSCO 2016-10-03 31.50\n", " 2016-10-04 31.35\n", " 2016-10-05 31.59\n", "MSFT 2016-10-03 57.42\n", " 2016-10-04 57.24\n", " 2016-10-05 57.64\n", "Name: Close, dtype: float64" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ser" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Close
Date2016-10-032016-10-042016-10-05
Symbol
AAPL112.52113.00113.05
CSCO31.5031.3531.59
MSFT57.4257.2457.64
\n", "
" ], "text/plain": [ " Close \n", "Date 2016-10-03 2016-10-04 2016-10-05\n", "Symbol \n", "AAPL 112.52 113.00 113.05\n", "CSCO 31.50 31.35 31.59\n", "MSFT 57.42 57.24 57.64" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ser.to_frame().pivot_table(index=\"Symbol\", columns=\"Date\")" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CloseVolume
SymbolDate
CSCO2016-10-0331.5014070500
AAPL2016-10-03112.5221701800
MSFT2016-10-0357.4219189500
AAPL2016-10-04113.0029736800
MSFT2016-10-0457.2420085900
CSCO2016-10-0431.3518460400
MSFT2016-10-0557.6416726400
CSCO2016-10-0531.5911808600
AAPL2016-10-05113.0521453100
\n", "
" ], "text/plain": [ " Close Volume\n", "Symbol Date \n", "CSCO 2016-10-03 31.50 14070500\n", "AAPL 2016-10-03 112.52 21701800\n", "MSFT 2016-10-03 57.42 19189500\n", "AAPL 2016-10-04 113.00 29736800\n", "MSFT 2016-10-04 57.24 20085900\n", "CSCO 2016-10-04 31.35 18460400\n", "MSFT 2016-10-05 57.64 16726400\n", "CSCO 2016-10-05 31.59 11808600\n", "AAPL 2016-10-05 113.05 21453100" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks = stocks.set_index([\"Symbol\", \"Date\"])\n", "stocks" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MultiIndex([('CSCO', '2016-10-03'),\n", " ('AAPL', '2016-10-03'),\n", " ('MSFT', '2016-10-03'),\n", " ('AAPL', '2016-10-04'),\n", " ('MSFT', '2016-10-04'),\n", " ('CSCO', '2016-10-04'),\n", " ('MSFT', '2016-10-05'),\n", " ('CSCO', '2016-10-05'),\n", " ('AAPL', '2016-10-05')],\n", " names=['Symbol', 'Date'])" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.index" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CloseVolume
SymbolDate
AAPL2016-10-03112.5221701800
2016-10-04113.0029736800
2016-10-05113.0521453100
CSCO2016-10-0331.5014070500
2016-10-0431.3518460400
2016-10-0531.5911808600
MSFT2016-10-0357.4219189500
2016-10-0457.2420085900
2016-10-0557.6416726400
\n", "
" ], "text/plain": [ " Close Volume\n", "Symbol Date \n", "AAPL 2016-10-03 112.52 21701800\n", " 2016-10-04 113.00 29736800\n", " 2016-10-05 113.05 21453100\n", "CSCO 2016-10-03 31.50 14070500\n", " 2016-10-04 31.35 18460400\n", " 2016-10-05 31.59 11808600\n", "MSFT 2016-10-03 57.42 19189500\n", " 2016-10-04 57.24 20085900\n", " 2016-10-05 57.64 16726400" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks = stocks.sort_index()\n", "stocks" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Close 112.52\n", "Volume 21701800.00\n", "Name: (AAPL, 2016-10-03), dtype: float64" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.loc[(\"AAPL\", \"2016-10-03\"), :]" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "np.float64(112.52)" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.loc[(\"AAPL\", \"2016-10-03\"), \"Close\"]" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CloseVolume
SymbolDate
AAPL2016-10-03112.5221701800
2016-10-04113.0029736800
2016-10-05113.0521453100
MSFT2016-10-0357.4219189500
2016-10-0457.2420085900
2016-10-0557.6416726400
\n", "
" ], "text/plain": [ " Close Volume\n", "Symbol Date \n", "AAPL 2016-10-03 112.52 21701800\n", " 2016-10-04 113.00 29736800\n", " 2016-10-05 113.05 21453100\n", "MSFT 2016-10-03 57.42 19189500\n", " 2016-10-04 57.24 20085900\n", " 2016-10-05 57.64 16726400" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.loc[[\"AAPL\", \"MSFT\"], :]" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CloseVolume
SymbolDate
AAPL2016-10-03112.5221701800
MSFT2016-10-0357.4219189500
\n", "
" ], "text/plain": [ " Close Volume\n", "Symbol Date \n", "AAPL 2016-10-03 112.52 21701800\n", "MSFT 2016-10-03 57.42 19189500" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.loc[([\"AAPL\", \"MSFT\"], \"2016-10-03\"), :]" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Symbol Date \n", "AAPL 2016-10-03 112.52\n", "MSFT 2016-10-03 57.42\n", "Name: Close, dtype: float64" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.loc[([\"AAPL\", \"MSFT\"], \"2016-10-03\"), \"Close\"]" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CloseVolume
SymbolDate
AAPL2016-10-03112.5221701800
2016-10-04113.0029736800
CSCO2016-10-0331.5014070500
2016-10-0431.3518460400
MSFT2016-10-0357.4219189500
2016-10-0457.2420085900
\n", "
" ], "text/plain": [ " Close Volume\n", "Symbol Date \n", "AAPL 2016-10-03 112.52 21701800\n", " 2016-10-04 113.00 29736800\n", "CSCO 2016-10-03 31.50 14070500\n", " 2016-10-04 31.35 18460400\n", "MSFT 2016-10-03 57.42 19189500\n", " 2016-10-04 57.24 20085900" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.loc[(slice(None), [\"2016-10-03\", \"2016-10-04\"]), :]" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Pivot\n" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Date2016-10-032016-10-042016-10-05
Symbol
AAPL112.52113.00113.05
CSCO31.5031.3531.59
MSFT57.4257.2457.64
\n", "
" ], "text/plain": [ "Date 2016-10-03 2016-10-04 2016-10-05\n", "Symbol \n", "AAPL 112.52 113.00 113.05\n", "CSCO 31.50 31.35 31.59\n", "MSFT 57.42 57.24 57.64" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks = stocks.pivot_table(values=\"Close\", index=\"Symbol\", columns=\"Date\")\n", "stocks" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Clean\n" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Remove duplicate rows\n" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agegenderoccupationzip_code
user_id
124Mtechnician85711
253Fother94043
323Mwriter32067
424Mtechnician43537
533Fother15213
\n", "
" ], "text/plain": [ " age gender occupation zip_code\n", "user_id \n", "1 24 M technician 85711\n", "2 53 F other 94043\n", "3 23 M writer 32067\n", "4 24 M technician 43537\n", "5 33 F other 15213" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_cols = [\"user_id\", \"age\", \"gender\", \"occupation\", \"zip_code\"]\n", "users = pd.read_csv(\n", " \"data/movie.user\", sep=\"|\", header=None, names=user_cols, index_col=\"user_id\"\n", ")\n", "users.head()" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(943, 4)" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "users.shape" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "user_id\n", "939 False\n", "940 True\n", "941 False\n", "942 False\n", "943 False\n", "Name: zip_code, dtype: bool" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# detect duplicate zip codes: True if an item is identical to a previous item\n", "users[\"zip_code\"].duplicated().tail()" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "np.int64(148)" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# count the duplicate items (True becomes 1, False becomes 0)\n", "users[\"zip_code\"].duplicated().sum()" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agegenderoccupationzip_code
user_id
49621Fstudent55414
57251Meducator20003
62117Mstudent60402
68428Mstudent55414
73344Fother60630
80527Fother20009
89032Mstudent97301
\n", "
" ], "text/plain": [ " age gender occupation zip_code\n", "user_id \n", "496 21 F student 55414\n", "572 51 M educator 20003\n", "621 17 M student 60402\n", "684 28 M student 55414\n", "733 44 F other 60630\n", "805 27 F other 20009\n", "890 32 M student 97301" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# examine the duplicate rows (ignoring the first occurrence)\n", "users.loc[users.duplicated(keep=\"first\"), :]" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agegenderoccupationzip_code
user_id
6717Mstudent60402
8551Meducator20003
19821Fstudent55414
35032Mstudent97301
42828Mstudent55414
43727Fother20009
46044Fother60630
\n", "
" ], "text/plain": [ " age gender occupation zip_code\n", "user_id \n", "67 17 M student 60402\n", "85 51 M educator 20003\n", "198 21 F student 55414\n", "350 32 M student 97301\n", "428 28 M student 55414\n", "437 27 F other 20009\n", "460 44 F other 60630" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# examine the duplicate rows (ignoring the last occurrence)\n", "users.loc[users.duplicated(keep=\"last\"), :]" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agegenderoccupationzip_code
user_id
6717Mstudent60402
8551Meducator20003
19821Fstudent55414
35032Mstudent97301
42828Mstudent55414
43727Fother20009
46044Fother60630
49621Fstudent55414
57251Meducator20003
62117Mstudent60402
68428Mstudent55414
73344Fother60630
80527Fother20009
89032Mstudent97301
\n", "
" ], "text/plain": [ " age gender occupation zip_code\n", "user_id \n", "67 17 M student 60402\n", "85 51 M educator 20003\n", "198 21 F student 55414\n", "350 32 M student 97301\n", "428 28 M student 55414\n", "437 27 F other 20009\n", "460 44 F other 60630\n", "496 21 F student 55414\n", "572 51 M educator 20003\n", "621 17 M student 60402\n", "684 28 M student 55414\n", "733 44 F other 60630\n", "805 27 F other 20009\n", "890 32 M student 97301" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# examine the duplicate rows (including all duplicates)\n", "users.loc[users.duplicated(keep=False), :]" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(936, 4)" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# drop the duplicate rows (inplace=False by default)\n", "users.drop_duplicates(keep=\"first\").shape" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(936, 4)" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "users.drop_duplicates(keep=\"last\").shape" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(929, 4)" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "users.drop_duplicates(keep=False).shape" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "np.int64(16)" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# only consider a subset of columns when identifying duplicates\n", "users.duplicated(subset=[\"age\", \"zip_code\"]).sum()" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(927, 4)" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "users.drop_duplicates(subset=[\"age\", \"zip_code\"]).shape" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Detect missing values\n" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityColors ReportedShape ReportedStateTime
18236Grant ParkNaNTRIANGLEIL12/31/2000 23:00
18237Spirit LakeNaNDISKIA12/31/2000 23:00
18238Eagle RiverNaNNaNWI12/31/2000 23:45
18239Eagle RiverREDLIGHTWI12/31/2000 23:45
18240YborNaNOVALFL12/31/2000 23:59
\n", "
" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "18236 Grant Park NaN TRIANGLE IL 12/31/2000 23:00\n", "18237 Spirit Lake NaN DISK IA 12/31/2000 23:00\n", "18238 Eagle River NaN NaN WI 12/31/2000 23:45\n", "18239 Eagle River RED LIGHT WI 12/31/2000 23:45\n", "18240 Ybor NaN OVAL FL 12/31/2000 23:59" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ufo = pd.read_csv(\"data/ufo.csv\")\n", "ufo.tail()" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityColors ReportedShape ReportedStateTime
18236FalseTrueFalseFalseFalse
18237FalseTrueFalseFalseFalse
18238FalseTrueTrueFalseFalse
18239FalseFalseFalseFalseFalse
18240FalseTrueFalseFalseFalse
\n", "
" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "18236 False True False False False\n", "18237 False True False False False\n", "18238 False True True False False\n", "18239 False False False False False\n", "18240 False True False False False" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ufo.isna().tail()" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityColors ReportedShape ReportedStateTime
18236TrueFalseTrueTrueTrue
18237TrueFalseTrueTrueTrue
18238TrueFalseFalseTrueTrue
18239TrueTrueTrueTrueTrue
18240TrueFalseTrueTrueTrue
\n", "
" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "18236 True False True True True\n", "18237 True False True True True\n", "18238 True False False True True\n", "18239 True True True True True\n", "18240 True False True True True" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ufo.notna().tail()" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "City 26\n", "Colors Reported 15359\n", "Shape Reported 2644\n", "State 0\n", "Time 0\n", "dtype: int64" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# count the number of missing values in each Series\n", "ufo.isna().sum()" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityColors ReportedShape ReportedStateTime
21NaNNaNNaNLA8/15/1943 0:00
22NaNNaNLIGHTLA8/15/1943 0:00
204NaNNaNDISKCA7/15/1952 12:30
241NaNBLUEDISKMT7/4/1953 14:00
613NaNNaNDISKNV7/1/1960 12:00
\n", "
" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "21 NaN NaN NaN LA 8/15/1943 0:00\n", "22 NaN NaN LIGHT LA 8/15/1943 0:00\n", "204 NaN NaN DISK CA 7/15/1952 12:30\n", "241 NaN BLUE DISK MT 7/4/1953 14:00\n", "613 NaN NaN DISK NV 7/1/1960 12:00" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use the 'isna' Series method to filter the DataFrame rows\n", "ufo.loc[ufo.City.isna()].head()" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(18241, 5)" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# examine the number of rows and columns\n", "ufo.shape" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2486, 5)" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# if 'any' values are missing in a row, then drop that row\n", "ufo.dropna(how=\"any\").shape" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(18241, 5)" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 'inplace' parameter for 'dropna' is False by default, thus rows were only dropped temporarily\n", "ufo.shape" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(18241, 5)" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# if 'all' values are missing in a row, then drop that row (none are dropped in this case)\n", "ufo.dropna(how=\"all\").shape" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(15575, 5)" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# if 'any' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row\n", "ufo.dropna(subset=[\"City\", \"Shape Reported\"], how=\"any\").shape" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Shape Reported\n", "LIGHT 2803\n", "DISK 2122\n", "TRIANGLE 1889\n", "OTHER 1402\n", "CIRCLE 1365\n", "Name: count, dtype: int64" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 'value_counts' does not include missing values by default\n", "ufo[\"Shape Reported\"].value_counts().head()" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Shape Reported\n", "LIGHT 2803\n", "NaN 2644\n", "DISK 2122\n", "TRIANGLE 1889\n", "OTHER 1402\n", "Name: count, dtype: int64" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# explicitly include missing values\n", "ufo[\"Shape Reported\"].value_counts(dropna=False).head()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Fill missing values\n" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "# fill in missing values with a specified value\n", "ufo[\"Shape Reported\"] = ufo[\"Shape Reported\"].fillna(value=\"VARIOUS\")" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Shape Reported\n", "VARIOUS 2977\n", "LIGHT 2803\n", "DISK 2122\n", "TRIANGLE 1889\n", "OTHER 1402\n", "Name: count, dtype: int64" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# confirm that the missing values were filled in\n", "ufo[\"Shape Reported\"].value_counts().head()" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityColors ReportedShape ReportedStateTime
0IthacaUNKNOWNTRIANGLENY6/1/1930 22:00
1WillingboroUNKNOWNOTHERNJ6/30/1930 20:00
2HolyokeUNKNOWNOVALCO2/15/1931 14:00
3AbileneUNKNOWNDISKKS6/1/1931 13:00
4New York Worlds FairUNKNOWNLIGHTNY4/18/1933 19:00
\n", "
" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "0 Ithaca UNKNOWN TRIANGLE NY 6/1/1930 22:00\n", "1 Willingboro UNKNOWN OTHER NJ 6/30/1930 20:00\n", "2 Holyoke UNKNOWN OVAL CO 2/15/1931 14:00\n", "3 Abilene UNKNOWN DISK KS 6/1/1931 13:00\n", "4 New York Worlds Fair UNKNOWN LIGHT NY 4/18/1933 19:00" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# fill in missing values\n", "ufo.fillna(value=\"UNKNOWN\").head()" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/py/n14256yd5r5ddms88x9bvsv40000gn/T/ipykernel_27146/278037071.py:2: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.\n", " ufo.fillna(method=\"bfill\").tail()\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityColors ReportedShape ReportedStateTime
18236Grant ParkREDTRIANGLEIL12/31/2000 23:00
18237Spirit LakeREDDISKIA12/31/2000 23:00
18238Eagle RiverREDVARIOUSWI12/31/2000 23:45
18239Eagle RiverREDLIGHTWI12/31/2000 23:45
18240YborNaNOVALFL12/31/2000 23:59
\n", "
" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "18236 Grant Park RED TRIANGLE IL 12/31/2000 23:00\n", "18237 Spirit Lake RED DISK IA 12/31/2000 23:00\n", "18238 Eagle River RED VARIOUS WI 12/31/2000 23:45\n", "18239 Eagle River RED LIGHT WI 12/31/2000 23:45\n", "18240 Ybor NaN OVAL FL 12/31/2000 23:59" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# fill missing values using \"backward fill\" strategy (doesn't affect the DataFrame since inplace=False)\n", "ufo.fillna(method=\"bfill\").tail()" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/py/n14256yd5r5ddms88x9bvsv40000gn/T/ipykernel_27146/2160560122.py:2: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.\n", " ufo.fillna(method=\"ffill\").tail()\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityColors ReportedShape ReportedStateTime
18236Grant ParkREDTRIANGLEIL12/31/2000 23:00
18237Spirit LakeREDDISKIA12/31/2000 23:00
18238Eagle RiverREDVARIOUSWI12/31/2000 23:45
18239Eagle RiverREDLIGHTWI12/31/2000 23:45
18240YborREDOVALFL12/31/2000 23:59
\n", "
" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "18236 Grant Park RED TRIANGLE IL 12/31/2000 23:00\n", "18237 Spirit Lake RED DISK IA 12/31/2000 23:00\n", "18238 Eagle River RED VARIOUS WI 12/31/2000 23:45\n", "18239 Eagle River RED LIGHT WI 12/31/2000 23:45\n", "18240 Ybor RED OVAL FL 12/31/2000 23:59" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# compare with \"forward fill\" strategy (doesn't affect the DataFrame since inplace=False)\n", "ufo.fillna(method=\"ffill\").tail()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Where & Mask\n" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S " ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train = pd.read_csv(\"data/titanic_train.csv\")\n", "train.head()" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...Female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. LainaFemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)Female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... Female 38.0 1 \n", "2 Heikkinen, Miss. Laina Female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) Female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S " ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cond = train[\"Sex\"] == \"male\"\n", "# replace the values unsatisfied the condition, default is NaN\n", "train[\"Sex\"] = train[\"Sex\"].where(cond, other=\"Female\")\n", "train.head()" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedquality
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNSlow
1211Cumings, Mrs. John Bradley (Florence Briggs Th...Female38.010PC 1759971.2833C85Clow
2313Heikkinen, Miss. LainaFemale26.000STON/O2. 31012827.9250NaNSlow
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)Female35.01011380353.1000C123Slow
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... Female 38.0 1 \n", "2 Heikkinen, Miss. Laina Female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) Female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked quality \n", "0 0 A/5 21171 7.2500 NaN S low \n", "1 0 PC 17599 71.2833 C85 C low \n", "2 0 STON/O2. 3101282 7.9250 NaN S low \n", "3 0 113803 53.1000 C123 S low \n", "4 0 373450 8.0500 NaN S " ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train[\"quality\"] = \"\"\n", "cond1 = train[\"Sex\"] == \"male\"\n", "cond2 = train[\"Age\"] > 25\n", "# display the values unsatisfied the condition\n", "train[\"quality\"] = train[\"quality\"].where(cond1 & cond2, other=\"low\")\n", "train.head()" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedquality
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...Female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. LainaFemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)Female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNSlow
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... Female 38.0 1 \n", "2 Heikkinen, Miss. Laina Female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) Female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked quality \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S low " ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train[\"quality\"] = \"\"\n", "cond1 = train[\"Sex\"] == \"male\"\n", "cond2 = train[\"Age\"] > 25\n", "# mask the values unsatisfied the condition\n", "train[\"quality\"] = train[\"quality\"].mask(cond1 & cond2, other=\"low\")\n", "train.head()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Outliers (clip)\n" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 10\n", "1 54\n", "2 14\n", "3 300\n", "4 45\n", " ... \n", "188 10\n", "189 10\n", "190 10\n", "191 10\n", "192 10\n", "Name: wine_servings, Length: 193, dtype: int64" ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drinks[\"wine_servings\"].clip(10, 300)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## String & Time\n" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Change the data type\n" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrybeer_servingsspirit_servingswine_servingstotal_litres_of_pure_alcoholcontinent
0Afghanistan0000.0Asia
1Albania89132544.9Europe
2Algeria250140.7Africa
3Andorra24513831212.4Europe
4Angola21757455.9Africa
\n", "
" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0 0 0 \n", "1 Albania 89 132 54 \n", "2 Algeria 25 0 14 \n", "3 Andorra 245 138 312 \n", "4 Angola 217 57 45 \n", "\n", " total_litres_of_pure_alcohol continent \n", "0 0.0 Asia \n", "1 4.9 Europe \n", "2 0.7 Africa \n", "3 12.4 Europe \n", "4 5.9 Africa " ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drinks = pd.read_csv(\"data/drinks.csv\")\n", "drinks.head()" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "country object\n", "beer_servings int64\n", "spirit_servings int64\n", "wine_servings int64\n", "total_litres_of_pure_alcohol float64\n", "continent object\n", "dtype: object" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# examine the data type of each Series\n", "drinks.dtypes" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "country string[python]\n", "beer_servings Int64\n", "spirit_servings Int64\n", "wine_servings Int64\n", "total_litres_of_pure_alcohol Float64\n", "continent string[python]\n", "dtype: object" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drinks.convert_dtypes().dtypes" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "country object\n", "beer_servings float64\n", "spirit_servings int64\n", "wine_servings int64\n", "total_litres_of_pure_alcohol float64\n", "continent object\n", "dtype: object" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# change the data type of an existing Series\n", "drinks[\"beer_servings\"] = drinks[\"beer_servings\"].astype(float)\n", "drinks.dtypes" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Use string methods\n" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_idquantityitem_namechoice_descriptionitem_price
011Chips and Fresh Tomato SalsaNaN$2.39
111Izze[Clementine]$3.39
211Nantucket Nectar[Apple]$3.39
311Chips and Tomatillo-Green Chili SalsaNaN$2.39
422Chicken Bowl[Tomatillo-Red Chili Salsa (Hot), [Black Beans...$16.98
\n", "
" ], "text/plain": [ " order_id quantity item_name \\\n", "0 1 1 Chips and Fresh Tomato Salsa \n", "1 1 1 Izze \n", "2 1 1 Nantucket Nectar \n", "3 1 1 Chips and Tomatillo-Green Chili Salsa \n", "4 2 2 Chicken Bowl \n", "\n", " choice_description item_price \n", "0 NaN $2.39 \n", "1 [Clementine] $3.39 \n", "2 [Apple] $3.39 \n", "3 NaN $2.39 \n", "4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans... $16.98 " ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "orders = pd.read_csv(\"data/chipotle.tsv\", sep=\"\\t\")\n", "orders.head()" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 CHIPS AND FRESH TOMATO SALSA\n", "1 IZZE\n", "2 NANTUCKET NECTAR\n", "3 CHIPS AND TOMATILLO-GREEN CHILI SALSA\n", "4 CHICKEN BOWL\n", "Name: item_name, dtype: object" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# string methods for pandas Series are accessed via 'str'\n", "orders.item_name.str.upper().head()" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_idquantityitem_namechoice_descriptionitem_price
422Chicken Bowl[Tomatillo-Red Chili Salsa (Hot), [Black Beans...$16.98
531Chicken Bowl[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...$10.98
1161Chicken Crispy Tacos[Roasted Chili Corn Salsa, [Fajita Vegetables,...$8.75
1261Chicken Soft Tacos[Roasted Chili Corn Salsa, [Rice, Black Beans,...$8.75
1371Chicken Bowl[Fresh Tomato Salsa, [Fajita Vegetables, Rice,...$11.25
\n", "
" ], "text/plain": [ " order_id quantity item_name \\\n", "4 2 2 Chicken Bowl \n", "5 3 1 Chicken Bowl \n", "11 6 1 Chicken Crispy Tacos \n", "12 6 1 Chicken Soft Tacos \n", "13 7 1 Chicken Bowl \n", "\n", " choice_description item_price \n", "4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans... $16.98 \n", "5 [Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou... $10.98 \n", "11 [Roasted Chili Corn Salsa, [Fajita Vegetables,... $8.75 \n", "12 [Roasted Chili Corn Salsa, [Rice, Black Beans,... $8.75 \n", "13 [Fresh Tomato Salsa, [Fajita Vegetables, Rice,... $11.25 " ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use the boolean Series to filter the DataFrame\n", "orders[orders.item_name.str.contains(\"Chicken\")].head()" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 NaN\n", "1 Clementine\n", "2 Apple\n", "3 NaN\n", "4 Tomatillo-Red Chili Salsa (Hot), Black Beans, ...\n", "Name: choice_description, dtype: object" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# string methods can be chained together\n", "orders.choice_description.str.replace(\"[\", \"\").str.replace(\"]\", \"\").head()" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 NaN\n", "1 [Clementine]\n", "2 [Apple]\n", "3 NaN\n", "4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans...\n", "Name: choice_description, dtype: object" ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# many pandas string methods support regular expressions (regex)\n", "orders.choice_description.str.replace(r\"[\\[\\]]\", \"\").head()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Dates & Times\n" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityColors ReportedShape ReportedStateTime
0IthacaNaNTRIANGLENY6/1/1930 22:00
1WillingboroNaNOTHERNJ6/30/1930 20:00
2HolyokeNaNOVALCO2/15/1931 14:00
3AbileneNaNDISKKS6/1/1931 13:00
4New York Worlds FairNaNLIGHTNY4/18/1933 19:00
\n", "
" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "0 Ithaca NaN TRIANGLE NY 6/1/1930 22:00\n", "1 Willingboro NaN OTHER NJ 6/30/1930 20:00\n", "2 Holyoke NaN OVAL CO 2/15/1931 14:00\n", "3 Abilene NaN DISK KS 6/1/1931 13:00\n", "4 New York Worlds Fair NaN LIGHT NY 4/18/1933 19:00" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# read a dataset of UFO reports into a DataFrame\n", "ufo = pd.read_csv(\"data/ufo.csv\")\n", "ufo.head()" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "City object\n", "Colors Reported object\n", "Shape Reported object\n", "State object\n", "Time object\n", "dtype: object" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 'Time' is currently stored as a string\n", "ufo.dtypes" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityColors ReportedShape ReportedStateTime
0IthacaNaNTRIANGLENY1930-06-01 22:00:00
1WillingboroNaNOTHERNJ1930-06-30 20:00:00
2HolyokeNaNOVALCO1931-02-15 14:00:00
3AbileneNaNDISKKS1931-06-01 13:00:00
4New York Worlds FairNaNLIGHTNY1933-04-18 19:00:00
\n", "
" ], "text/plain": [ " City Colors Reported Shape Reported State \\\n", "0 Ithaca NaN TRIANGLE NY \n", "1 Willingboro NaN OTHER NJ \n", "2 Holyoke NaN OVAL CO \n", "3 Abilene NaN DISK KS \n", "4 New York Worlds Fair NaN LIGHT NY \n", "\n", " Time \n", "0 1930-06-01 22:00:00 \n", "1 1930-06-30 20:00:00 \n", "2 1931-02-15 14:00:00 \n", "3 1931-06-01 13:00:00 \n", "4 1933-04-18 19:00:00 " ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# convert 'Time' to datetime format\n", "ufo[\"Time\"] = pd.to_datetime(ufo.Time)\n", "ufo.head()" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "City object\n", "Colors Reported object\n", "Shape Reported object\n", "State object\n", "Time datetime64[ns]\n", "dtype: object" ] }, "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ufo.dtypes" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityColors ReportedShape ReportedState
Time
1930-06-01 22:00:00IthacaNaNTRIANGLENY
1930-06-30 20:00:00WillingboroNaNOTHERNJ
1931-02-15 14:00:00HolyokeNaNOVALCO
1931-06-01 13:00:00AbileneNaNDISKKS
1933-04-18 19:00:00New York Worlds FairNaNLIGHTNY
\n", "
" ], "text/plain": [ " City Colors Reported Shape Reported State\n", "Time \n", "1930-06-01 22:00:00 Ithaca NaN TRIANGLE NY\n", "1930-06-30 20:00:00 Willingboro NaN OTHER NJ\n", "1931-02-15 14:00:00 Holyoke NaN OVAL CO\n", "1931-06-01 13:00:00 Abilene NaN DISK KS\n", "1933-04-18 19:00:00 New York Worlds Fair NaN LIGHT NY" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ufo = ufo.set_index(\"Time\")\n", "ufo.head()" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityColors ReportedShape ReportedState
Time
1943-06-01 15:00:00NebraskaNaNDISKNE
1947-07-15 15:00:00ClintwoodNaNCYLINDERVA
1947-07-15 15:00:00HazeltonNaNDISKID
1949-04-10 15:00:00SeattleNaNDISKWA
1949-06-30 15:00:00KnoxvilleNaNOTHERTN
...............
2000-10-12 15:00:00Mountain CityNaNDISKTN
2000-10-28 15:00:00KintnersvilleORANGEOTHERPA
2000-11-20 15:00:00St. JosephNaNEGGMO
2000-12-10 15:00:00MarshfieldNaNLIGHTWI
2000-12-27 15:00:00Kea'auNaNSPHEREHI
\n", "

254 rows × 4 columns

\n", "
" ], "text/plain": [ " City Colors Reported Shape Reported State\n", "Time \n", "1943-06-01 15:00:00 Nebraska NaN DISK NE\n", "1947-07-15 15:00:00 Clintwood NaN CYLINDER VA\n", "1947-07-15 15:00:00 Hazelton NaN DISK ID\n", "1949-04-10 15:00:00 Seattle NaN DISK WA\n", "1949-06-30 15:00:00 Knoxville NaN OTHER TN\n", "... ... ... ... ...\n", "2000-10-12 15:00:00 Mountain City NaN DISK TN\n", "2000-10-28 15:00:00 Kintnersville ORANGE OTHER PA\n", "2000-11-20 15:00:00 St. Joseph NaN EGG MO\n", "2000-12-10 15:00:00 Marshfield NaN LIGHT WI\n", "2000-12-27 15:00:00 Kea'au NaN SPHERE HI\n", "\n", "[254 rows x 4 columns]" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ufo.at_time(\"15:00\")" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityColors ReportedShape ReportedState
Time
1941-07-02 11:30:00Forest HomeNaNCIRCLECA
1943-10-15 11:00:00OwensboroNaNRECTANGLEKY
1944-01-01 10:00:00WildernessNaNDISKWV
1944-01-01 12:00:00San DiegoNaNCIGARCA
1944-01-01 12:00:00WildernessNaNDISKWV
...............
2000-12-27 10:00:00Santa PaulaNaNLIGHTCA
2000-12-28 11:00:00CerrililloNaNNaNNM
2000-12-28 12:00:00MansfieldNaNFLASHTX
2000-12-30 10:00:00Simi ValleyNaNFORMATIONCA
2000-12-31 11:45:00ChicagoNaNDISKIL
\n", "

781 rows × 4 columns

\n", "
" ], "text/plain": [ " City Colors Reported Shape Reported State\n", "Time \n", "1941-07-02 11:30:00 Forest Home NaN CIRCLE CA\n", "1943-10-15 11:00:00 Owensboro NaN RECTANGLE KY\n", "1944-01-01 10:00:00 Wilderness NaN DISK WV\n", "1944-01-01 12:00:00 San Diego NaN CIGAR CA\n", "1944-01-01 12:00:00 Wilderness NaN DISK WV\n", "... ... ... ... ...\n", "2000-12-27 10:00:00 Santa Paula NaN LIGHT CA\n", "2000-12-28 11:00:00 Cerrilillo NaN NaN NM\n", "2000-12-28 12:00:00 Mansfield NaN FLASH TX\n", "2000-12-30 10:00:00 Simi Valley NaN FORMATION CA\n", "2000-12-31 11:45:00 Chicago NaN DISK IL\n", "\n", "[781 rows x 4 columns]" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ufo.between_time(\"09:45\", \"12:00\")" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Category data\n" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrybeer_servingsspirit_servingswine_servingstotal_litres_of_pure_alcoholcontinent
0Afghanistan0000.0Asia
1Albania89132544.9Europe
2Algeria250140.7Africa
3Andorra24513831212.4Europe
4Angola21757455.9Africa
\n", "
" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0 0 0 \n", "1 Albania 89 132 54 \n", "2 Algeria 25 0 14 \n", "3 Andorra 245 138 312 \n", "4 Angola 217 57 45 \n", "\n", " total_litres_of_pure_alcohol continent \n", "0 0.0 Asia \n", "1 4.9 Europe \n", "2 0.7 Africa \n", "3 12.4 Europe \n", "4 5.9 Africa " ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drinks = pd.read_csv(\"data/drinks.csv\")\n", "drinks.head()" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "country string[python]\n", "beer_servings Int64\n", "spirit_servings Int64\n", "wine_servings Int64\n", "total_litres_of_pure_alcohol Float64\n", "continent string[python]\n", "dtype: object" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drinks = drinks.convert_dtypes()\n", "drinks.dtypes" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index 132\n", "country 11044\n", "beer_servings 1737\n", "spirit_servings 1737\n", "wine_servings 1737\n", "total_litres_of_pure_alcohol 1737\n", "continent 10788\n", "dtype: int64" ] }, "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# calculate the memory usage for each Series (in bytes)\n", "drinks.memory_usage(deep=True)" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "country string[python]\n", "beer_servings Int64\n", "spirit_servings Int64\n", "wine_servings Int64\n", "total_litres_of_pure_alcohol Float64\n", "continent category\n", "dtype: object" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use the 'category' data type to store the 'continent' strings as integers\n", "drinks[\"continent\"] = drinks[\"continent\"].astype(\"category\")\n", "drinks.dtypes" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 1\n", "1 2\n", "2 0\n", "3 2\n", "4 0\n", "dtype: int8" ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# strings are now encoded (0 means 'Africa', 1 means 'Asia', 2 means 'Europe', etc.)\n", "drinks[\"continent\"].cat.codes.head()" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index 132\n", "country 11044\n", "beer_servings 1737\n", "spirit_servings 1737\n", "wine_servings 1737\n", "total_litres_of_pure_alcohol 1737\n", "continent 708\n", "dtype: int64" ] }, "execution_count": 107, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# memory usage has been drastically reduced\n", "drinks.memory_usage(deep=True)" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index 132\n", "country 15598\n", "beer_servings 1737\n", "spirit_servings 1737\n", "wine_servings 1737\n", "total_litres_of_pure_alcohol 1737\n", "continent 708\n", "dtype: int64" ] }, "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# repeat this process for the 'country' Series\n", "drinks[\"country\"] = drinks[\"country\"].astype(\"category\")\n", "drinks.memory_usage(deep=True)" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',\n", " 'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',\n", " ...\n", " 'United Arab Emirates', 'United Kingdom', 'Uruguay', 'Uzbekistan',\n", " 'Vanuatu', 'Venezuela', 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],\n", " dtype='string', length=193)" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# memory usage increased because we created 193 categories\n", "drinks[\"country\"].cat.categories" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Dummy variables\n" ] }, { "cell_type": "code", "execution_count": 110, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S " ] }, "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train = pd.read_csv(\"data/titanic_train.csv\")\n", "train.head()" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
femalemale
0FalseTrue
1TrueFalse
2TrueFalse
3TrueFalse
4FalseTrue
\n", "
" ], "text/plain": [ " female male\n", "0 False True\n", "1 True False\n", "2 True False\n", "3 True False\n", "4 False True" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use 'get_dummies' to create one column for every possible value\n", "pd.get_dummies(train.Sex).head()" ] }, { "cell_type": "code", "execution_count": 112, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sex_male
0True
1False
2False
3False
4True
\n", "
" ], "text/plain": [ " Sex_male\n", "0 True\n", "1 False\n", "2 False\n", "3 False\n", "4 True" ] }, "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# drop the first dummy variable ('female') using the 'iloc' method\n", "# add a prefix to identify the source of the dummy variables\n", "pd.get_dummies(train.Sex, prefix=\"Sex\").iloc[:, 1:].head()" ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Embarked_CEmbarked_QEmbarked_S
0FalseFalseTrue
1TrueFalseFalse
2FalseFalseTrue
3FalseFalseTrue
4FalseFalseTrue
5FalseTrueFalse
6FalseFalseTrue
7FalseFalseTrue
8FalseFalseTrue
9TrueFalseFalse
\n", "
" ], "text/plain": [ " Embarked_C Embarked_Q Embarked_S\n", "0 False False True\n", "1 True False False\n", "2 False False True\n", "3 False False True\n", "4 False False True\n", "5 False True False\n", "6 False False True\n", "7 False False True\n", "8 False False True\n", "9 True False False" ] }, "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use 'get_dummies' with a feature that has 3 possible values\n", "pd.get_dummies(train.Embarked, prefix=\"Embarked\").head(10)" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Embarked_QEmbarked_S
0FalseTrue
1FalseFalse
2FalseTrue
3FalseTrue
4FalseTrue
5TrueFalse
6FalseTrue
7FalseTrue
8FalseTrue
9FalseFalse
\n", "
" ], "text/plain": [ " Embarked_Q Embarked_S\n", "0 False True\n", "1 False False\n", "2 False True\n", "3 False True\n", "4 False True\n", "5 True False\n", "6 False True\n", "7 False True\n", "8 False True\n", "9 False False" ] }, "execution_count": 114, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# drop the first dummy variable ('C')\n", "pd.get_dummies(train.Embarked, prefix=\"Embarked\").iloc[:, 1:].head(10)" ] }, { "cell_type": "code", "execution_count": 115, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedEmbarked_QEmbarked_S
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNSFalseTrue
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85CFalseFalse
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNSFalseTrue
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123SFalseTrue
4503Allen, Mr. William Henrymale35.0003734508.0500NaNSFalseTrue
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked Embarked_Q Embarked_S \n", "0 0 A/5 21171 7.2500 NaN S False True \n", "1 0 PC 17599 71.2833 C85 C False False \n", "2 0 STON/O2. 3101282 7.9250 NaN S False True \n", "3 0 113803 53.1000 C123 S False True \n", "4 0 373450 8.0500 NaN S False True " ] }, "execution_count": 115, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# save the DataFrame of dummy variables and concatenate them to the original DataFrame\n", "embarked_dummies = pd.get_dummies(train.Embarked, prefix=\"Embarked\").iloc[:, 1:]\n", "train = pd.concat([train, embarked_dummies], axis=1)\n", "train.head()" ] }, { "cell_type": "code", "execution_count": 116, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S " ] }, "execution_count": 116, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train = pd.read_csv(\"data/titanic_train.csv\")\n", "train.head()" ] }, { "cell_type": "code", "execution_count": 117, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameAgeSibSpParchTicketFareCabinSex_femaleSex_maleEmbarked_CEmbarked_QEmbarked_S
0103Braund, Mr. Owen Harris22.010A/5 211717.2500NaNFalseTrueFalseFalseTrue
1211Cumings, Mrs. John Bradley (Florence Briggs Th...38.010PC 1759971.2833C85TrueFalseTrueFalseFalse
2313Heikkinen, Miss. Laina26.000STON/O2. 31012827.9250NaNTrueFalseFalseFalseTrue
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)35.01011380353.1000C123TrueFalseFalseFalseTrue
4503Allen, Mr. William Henry35.0003734508.0500NaNFalseTrueFalseFalseTrue
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Age SibSp Parch \\\n", "0 Braund, Mr. Owen Harris 22.0 1 0 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 1 0 \n", "2 Heikkinen, Miss. Laina 26.0 0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 1 0 \n", "4 Allen, Mr. William Henry 35.0 0 0 \n", "\n", " Ticket Fare Cabin Sex_female Sex_male Embarked_C \\\n", "0 A/5 21171 7.2500 NaN False True False \n", "1 PC 17599 71.2833 C85 True False True \n", "2 STON/O2. 3101282 7.9250 NaN True False False \n", "3 113803 53.1000 C123 True False False \n", "4 373450 8.0500 NaN False True False \n", "\n", " Embarked_Q Embarked_S \n", "0 False True \n", "1 False False \n", "2 False True \n", "3 False True \n", "4 False True " ] }, "execution_count": 117, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pass the DataFrame to 'get_dummies' and specify which columns to dummy (it drops the original columns)\n", "pd.get_dummies(train, columns=[\"Sex\", \"Embarked\"]).head()" ] }, { "cell_type": "code", "execution_count": 118, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameAgeSibSpParchTicketFareCabinSex_maleEmbarked_QEmbarked_S
0103Braund, Mr. Owen Harris22.010A/5 211717.2500NaNTrueFalseTrue
1211Cumings, Mrs. John Bradley (Florence Briggs Th...38.010PC 1759971.2833C85FalseFalseFalse
2313Heikkinen, Miss. Laina26.000STON/O2. 31012827.9250NaNFalseFalseTrue
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)35.01011380353.1000C123FalseFalseTrue
4503Allen, Mr. William Henry35.0003734508.0500NaNTrueFalseTrue
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Age SibSp Parch \\\n", "0 Braund, Mr. Owen Harris 22.0 1 0 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 1 0 \n", "2 Heikkinen, Miss. Laina 26.0 0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 1 0 \n", "4 Allen, Mr. William Henry 35.0 0 0 \n", "\n", " Ticket Fare Cabin Sex_male Embarked_Q Embarked_S \n", "0 A/5 21171 7.2500 NaN True False True \n", "1 PC 17599 71.2833 C85 False False False \n", "2 STON/O2. 3101282 7.9250 NaN False False True \n", "3 113803 53.1000 C123 False False True \n", "4 373450 8.0500 NaN True False True " ] }, "execution_count": 118, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use the 'drop_first' parameter (new in pandas 0.18) to drop the first dummy variable for each feature\n", "pd.get_dummies(train, columns=[\"Sex\", \"Embarked\"], drop_first=True).head()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Cardinal data\n" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDquality
0100good
1101very good
2102good
3103excellent
\n", "
" ], "text/plain": [ " ID quality\n", "0 100 good\n", "1 101 very good\n", "2 102 good\n", "3 103 excellent" ] }, "execution_count": 119, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# create a small DataFrame from a dictionary\n", "small = pd.DataFrame(\n", " {\"ID\": [100, 101, 102, 103], \"quality\": [\"good\", \"very good\", \"good\", \"excellent\"]}\n", ")\n", "small.head()" ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDquality
3103excellent
0100good
2102good
1101very good
\n", "
" ], "text/plain": [ " ID quality\n", "3 103 excellent\n", "0 100 good\n", "2 102 good\n", "1 101 very good" ] }, "execution_count": 120, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sort the DataFrame by the 'quality' Series (alphabetical order)\n", "small.sort_values(\"quality\")" ] }, { "cell_type": "code", "execution_count": 121, "metadata": {}, "outputs": [], "source": [ "from pandas.api.types import CategoricalDtype" ] }, { "cell_type": "code", "execution_count": 122, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 good\n", "1 very good\n", "2 good\n", "3 excellent\n", "Name: quality, dtype: category\n", "Categories (3, object): ['good' < 'very good' < 'excellent']" ] }, "execution_count": 122, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# define a logical ordering for the categories\n", "cats = [\"good\", \"very good\", \"excellent\"]\n", "cat_type = CategoricalDtype(categories=cats, ordered=True)\n", "small[\"quality\"] = small.quality.astype(cat_type)\n", "small.quality" ] }, { "cell_type": "code", "execution_count": 123, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDquality
0100good
2102good
1101very good
3103excellent
\n", "
" ], "text/plain": [ " ID quality\n", "0 100 good\n", "2 102 good\n", "1 101 very good\n", "3 103 excellent" ] }, "execution_count": 123, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sort the DataFrame by the 'quality' Series (logical order)\n", "small.sort_values(\"quality\")" ] }, { "cell_type": "code", "execution_count": 124, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDquality
1101very good
3103excellent
\n", "
" ], "text/plain": [ " ID quality\n", "1 101 very good\n", "3 103 excellent" ] }, "execution_count": 124, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# comparison operators work with ordered categories\n", "small.loc[small.quality > \"good\", :]" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Sample rows\n" ] }, { "cell_type": "code", "execution_count": 125, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityColors ReportedShape ReportedState
Time
1952-09-15 14:00:00NorridgewockNaNDISKME
1998-10-01 21:15:00IpavaNaNTRIANGLEIL
2000-11-13 22:00:00EllinwoodNaNFIREBALLKS
\n", "
" ], "text/plain": [ " City Colors Reported Shape Reported State\n", "Time \n", "1952-09-15 14:00:00 Norridgewock NaN DISK ME\n", "1998-10-01 21:15:00 Ipava NaN TRIANGLE IL\n", "2000-11-13 22:00:00 Ellinwood NaN FIREBALL KS" ] }, "execution_count": 125, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use the 'random_state' parameter for reproducibility\n", "ufo.sample(n=3, random_state=42)" ] }, { "cell_type": "code", "execution_count": 126, "metadata": {}, "outputs": [], "source": [ "# sample 75% of the DataFrame's rows without replacement\n", "train = ufo.sample(frac=0.75, random_state=99)" ] }, { "cell_type": "code", "execution_count": 127, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityColors ReportedShape ReportedState
Time
1933-04-18 19:00:00New York Worlds FairNaNLIGHTNY
1934-09-15 15:30:00Valley CityNaNDISKND
1936-10-15 17:00:00EklutnaNaNCIGARAK
1939-06-01 20:00:00WaterlooNaNFIREBALLAL
1939-07-07 02:00:00KeokukNaNOVALIA
\n", "
" ], "text/plain": [ " City Colors Reported Shape Reported State\n", "Time \n", "1933-04-18 19:00:00 New York Worlds Fair NaN LIGHT NY\n", "1934-09-15 15:30:00 Valley City NaN DISK ND\n", "1936-10-15 17:00:00 Eklutna NaN CIGAR AK\n", "1939-06-01 20:00:00 Waterloo NaN FIREBALL AL\n", "1939-07-07 02:00:00 Keokuk NaN OVAL IA" ] }, "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# store the remaining 25% of the rows in another DataFrame\n", "test = ufo.loc[~ufo.index.isin(train.index), :]\n", "test.head()" ] } ], "metadata": { "kernelspec": { "display_name": "dox", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.7" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }