读写和过滤#
import pandas as pd
Read & Info#
Read tabular data#
user_cols = ["user_id", "age", "gender", "occupation", "zip_code"]
users = pd.read_csv("data/movie.user", sep="|", header=None, names=user_cols)
# examine the first 5 rows
users.head()
user_id | age | gender | occupation | zip_code | |
---|---|---|---|---|---|
0 | 1 | 24 | M | technician | 85711 |
1 | 2 | 53 | F | other | 94043 |
2 | 3 | 23 | M | writer | 32067 |
3 | 4 | 24 | M | technician | 43537 |
4 | 5 | 33 | F | other | 15213 |
# read_csv is equivalent to read_csv, except it assumes a comma separator
ufo = pd.read_csv("data//ufo.csv")
# examine the first 5 rows
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# specify which columns to include by name
pd.read_csv("data/ufo.csv", usecols=["City", "State"]).head()
City | State | |
---|---|---|
0 | Ithaca | NY |
1 | Willingboro | NJ |
2 | Holyoke | CO |
3 | Abilene | KS |
4 | New York Worlds Fair | NY |
# specify columns by position
pd.read_csv("data/ufo.csv", usecols=[0, 4]).head()
City | Time | |
---|---|---|
0 | Ithaca | 6/1/1930 22:00 |
1 | Willingboro | 6/30/1930 20:00 |
2 | Holyoke | 2/15/1931 14:00 |
3 | Abilene | 6/1/1931 13:00 |
4 | New York Worlds Fair | 4/18/1933 19:00 |
# specify how many rows to read
pd.read_csv("data/ufo.csv", nrows=3).head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
Describe data#
movies = pd.read_csv("data/imdb_1000.csv")
movies.head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
# example attribute: number of rows and columns
movies.shape
(979, 6)
# example attribute: data type of each column
movies.dtypes
star_rating float64
title object
content_rating object
genre object
duration int64
actors_list object
dtype: object
# example method: calculate summary statistics
movies.describe()
star_rating | duration | |
---|---|---|
count | 979.000000 | 979.000000 |
mean | 7.889785 | 120.979571 |
std | 0.336069 | 26.218010 |
min | 7.400000 | 64.000000 |
25% | 7.600000 | 102.000000 |
50% | 7.800000 | 117.000000 |
75% | 8.100000 | 134.000000 |
max | 9.300000 | 242.000000 |
# use an optional parameter to the describe method to summarize only 'object' columns
movies.describe(include=["object"])
title | content_rating | genre | actors_list | |
---|---|---|---|---|
count | 979 | 976 | 979 | 979 |
unique | 975 | 12 | 16 | 969 |
top | Dracula | R | Drama | [u'Daniel Radcliffe', u'Emma Watson', u'Rupert... |
freq | 2 | 460 | 278 | 6 |
Column Manipulation#
Create a new column#
# create a new 'Location' Series (must use bracket notation to define the Series name)
ufo["Location"] = f"{ufo.City}, {ufo.State}"
ufo.head()
City | Colors Reported | Shape Reported | State | Time | Location | |
---|---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 | 0 Ithaca\n1 ... |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 | 0 Ithaca\n1 ... |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 | 0 Ithaca\n1 ... |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 | 0 Ithaca\n1 ... |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 | 0 Ithaca\n1 ... |
Rename columns#
ufo = pd.read_csv("data/ufo.csv")
# examine the column names
ufo.columns
Index(['City', 'Colors Reported', 'Shape Reported', 'State', 'Time'], dtype='object')
# rename two of the columns by using the 'rename' method
ufo = ufo.rename(
columns={"Colors Reported": "Colors_Reported", "Shape Reported": "Shape_Reported"}
)
ufo.columns
Index(['City', 'Colors_Reported', 'Shape_Reported', 'State', 'Time'], dtype='object')
# replace all of the column names by overwriting the 'columns' attribute
ufo_cols = ["city", "colors reported", "shape reported", "state", "time"]
ufo.columns = ufo_cols
ufo.columns
Index(['city', 'colors reported', 'shape reported', 'state', 'time'], dtype='object')
# replace the column names during the file reading process by using the 'names' parameter
ufo = pd.read_csv("data/ufo.csv", header=0, names=ufo_cols)
ufo.columns
Index(['city', 'colors reported', 'shape reported', 'state', 'time'], dtype='object')
# replace all spaces with underscores in the column names by using the 'str.replace' method
ufo.columns = ufo.columns.str.replace(" ", "_")
ufo.columns
Index(['city', 'colors_reported', 'shape_reported', 'state', 'time'], dtype='object')
Remove columns#
ufo = pd.read_csv("data/ufo.csv")
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# remove a single column (axis=1 refers to columns)
ufo = ufo.drop("Colors Reported", axis=1)
ufo.head()
City | Shape Reported | State | Time | |
---|---|---|---|---|
0 | Ithaca | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | LIGHT | NY | 4/18/1933 19:00 |
# remove multiple columns at once
ufo.drop(columns=["City", "State"]).head()
Shape Reported | Time | |
---|---|---|
0 | TRIANGLE | 6/1/1930 22:00 |
1 | OTHER | 6/30/1930 20:00 |
2 | OVAL | 2/15/1931 14:00 |
3 | DISK | 6/1/1931 13:00 |
4 | LIGHT | 4/18/1933 19:00 |
Remove rows#
# new way to drop rows: specify index
ufo.drop(index=[0, 1]).head()
City | Shape Reported | State | Time | |
---|---|---|---|---|
2 | Holyoke | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | LIGHT | NY | 4/18/1933 19:00 |
5 | Valley City | DISK | ND | 9/15/1934 15:30 |
6 | Crater Lake | CIRCLE | CA | 6/15/1935 0:00 |
Select#
Iterate#
ufo = pd.read_csv("data/ufo.csv")
ufo.head()
City | Colors Reported | Shape Reported | State | Time | |
---|---|---|---|---|---|
0 | Ithaca | NaN | TRIANGLE | NY | 6/1/1930 22:00 |
1 | Willingboro | NaN | OTHER | NJ | 6/30/1930 20:00 |
2 | Holyoke | NaN | OVAL | CO | 2/15/1931 14:00 |
3 | Abilene | NaN | DISK | KS | 6/1/1931 13:00 |
4 | New York Worlds Fair | NaN | LIGHT | NY | 4/18/1933 19:00 |
# various methods are available to iterate through a DataFrame
for index, row in ufo[:10].iterrows():
print(index, row.City, row.State)
0 Ithaca NY
1 Willingboro NJ
2 Holyoke CO
3 Abilene KS
4 New York Worlds Fair NY
5 Valley City ND
6 Crater Lake CA
7 Alma MI
8 Eklutna AK
9 Hubbard OR
Index#
# index and columns both default to integers if you don't define them
pd.read_csv("data/drinks.csv", header=None).head()
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent |
1 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
2 | Albania | 89 | 132 | 54 | 4.9 | Europe |
3 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
4 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
drinks = pd.read_csv("data/drinks.csv")
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# every DataFrame has an index (sometimes called the "row labels")
drinks.index
RangeIndex(start=0, stop=193, step=1)
# set an existing column as the index
drinks = drinks.set_index("country")
drinks.head()
# 'country' is now the index
drinks.index
Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
...
'Tanzania', 'USA', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela',
'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],
dtype='object', name='country', length=193)
# restore the index name, and move the index back to a column
drinks.index.name = "country"
drinks = drinks.reset_index()
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
Select a column#
drinks = pd.read_csv("data/drinks.csv")
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# select the 'City' Series using bracket notation
drinks["country"]
# or equivalently, use dot notation
drinks.country
0 Afghanistan
1 Albania
2 Algeria
3 Andorra
4 Angola
...
188 Venezuela
189 Vietnam
190 Yemen
191 Zambia
192 Zimbabwe
Name: country, Length: 193, dtype: object
drink = "wine"
drinks[f"{drink}_servings"]
0 0
1 54
2 14
3 312
4 45
...
188 3
189 1
190 0
191 4
192 4
Name: wine_servings, Length: 193, dtype: int64
Select numeric columns#
drinks.dtypes
country object
beer_servings int64
spirit_servings int64
wine_servings int64
total_litres_of_pure_alcohol float64
continent object
dtype: object
import numpy as np
drinks.select_dtypes(include=[np.number]).dtypes
beer_servings int64
spirit_servings int64
wine_servings int64
total_litres_of_pure_alcohol float64
dtype: object
drinks.select_dtypes(include=[np.number]).squeeze("columns")
beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | |
---|---|---|---|---|
0 | 0 | 0 | 0 | 0.0 |
1 | 89 | 132 | 54 | 4.9 |
2 | 25 | 0 | 14 | 0.7 |
3 | 245 | 138 | 312 | 12.4 |
4 | 217 | 57 | 45 | 5.9 |
... | ... | ... | ... | ... |
188 | 333 | 100 | 3 | 7.7 |
189 | 111 | 2 | 1 | 2.0 |
190 | 6 | 0 | 0 | 0.1 |
191 | 32 | 19 | 4 | 2.5 |
192 | 64 | 18 | 4 | 4.7 |
193 rows × 4 columns
Select rows and columns#
# row 0, all columns
drinks.loc[0, :]
country Afghanistan
beer_servings 0
spirit_servings 0
wine_servings 0
total_litres_of_pure_alcohol 0.0
continent Asia
Name: 0, dtype: object
# rows 0 and 1 and 2, all columns
drinks.loc[[0, 1, 2], :]
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
# rows 0 through 2 (inclusive), all columns
drinks.loc[0:2, :]
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
# rows 0 through 2 (inclusive), column 'City'
drinks.loc[0:2, "country"]
0 Afghanistan
1 Albania
2 Algeria
Name: country, dtype: object
# rows 0 through 2 (inclusive), columns 'City' and 'State'
drinks.loc[0:2, ["country", "continent"]]
country | continent | |
---|---|---|
0 | Afghanistan | Asia |
1 | Albania | Europe |
2 | Algeria | Africa |
# rows 0 through 2 (inclusive), columns 'City' through 'State' (inclusive)
drinks.loc[0:2, "country":"continent"]
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
# rows in positions 0 and 1, columns in positions 0 and 3
drinks.iloc[[0, 1], [0, 3]]
country | wine_servings | |
---|---|---|
0 | Afghanistan | 0 |
1 | Albania | 54 |
# rows in positions 0 through 2 (exclusive), columns in positions 0 through 4 (exclusive)
drinks.iloc[0:2, 0:4]
country | beer_servings | spirit_servings | wine_servings | |
---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 |
1 | Albania | 89 | 132 | 54 |
# rows in positions 0 through 2 (exclusive), all columns
drinks.iloc[0:2, :]
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
Filter#
Column filter#
drinks.filter(items=["beer_servings", "continent"])
beer_servings | continent | |
---|---|---|
0 | 0 | Asia |
1 | 89 | Europe |
2 | 25 | Africa |
3 | 245 | Europe |
4 | 217 | Africa |
... | ... | ... |
188 | 333 | South America |
189 | 111 | Asia |
190 | 6 | Asia |
191 | 32 | Africa |
192 | 64 | Africa |
193 rows × 2 columns
drinks.filter(regex="servings", axis=1)
beer_servings | spirit_servings | wine_servings | |
---|---|---|---|
0 | 0 | 0 | 0 |
1 | 89 | 132 | 54 |
2 | 25 | 0 | 14 |
3 | 245 | 138 | 312 |
4 | 217 | 57 | 45 |
... | ... | ... | ... |
188 | 333 | 100 | 3 |
189 | 111 | 2 | 1 |
190 | 6 | 0 | 0 |
191 | 32 | 19 | 4 |
192 | 64 | 18 | 4 |
193 rows × 3 columns
Index filter#
drinks.filter(like="2", axis=0).head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
12 | Bahrain | 42 | 63 | 7 | 2.0 | Asia |
20 | Bolivia | 167 | 41 | 8 | 3.8 | South America |
21 | Bosnia-Herzegovina | 76 | 173 | 8 | 4.6 | Europe |
22 | Botswana | 173 | 35 | 35 | 5.4 | Africa |
Value filter#
drinks.loc[drinks.continent == "South America"]
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
6 | Argentina | 193 | 25 | 221 | 8.3 | South America |
20 | Bolivia | 167 | 41 | 8 | 3.8 | South America |
23 | Brazil | 245 | 145 | 16 | 7.2 | South America |
35 | Chile | 130 | 124 | 172 | 7.6 | South America |
37 | Colombia | 159 | 76 | 3 | 4.2 | South America |
52 | Ecuador | 162 | 74 | 3 | 4.2 | South America |
72 | Guyana | 93 | 302 | 1 | 7.1 | South America |
132 | Paraguay | 213 | 117 | 74 | 7.3 | South America |
133 | Peru | 163 | 160 | 21 | 6.1 | South America |
163 | Suriname | 128 | 178 | 7 | 5.6 | South America |
185 | Uruguay | 115 | 35 | 220 | 6.6 | South America |
188 | Venezuela | 333 | 100 | 3 | 7.7 | South America |
drinks.loc[drinks.continent == "South America", "wine_servings"]
6 221
20 8
23 16
35 172
37 3
52 3
72 1
132 74
133 21
163 7
185 220
188 3
Name: wine_servings, dtype: int64
drinks.query("continent=='South America'")["wine_servings"]
6 221
20 8
23 16
35 172
37 3
52 3
72 1
132 74
133 21
163 7
185 220
188 3
Name: wine_servings, dtype: int64
movies = pd.read_csv("data/imdb_1000.csv")
movies.head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
# use the '&' operator to specify that both conditions are required
movies.query("duration >=200 & genre == 'Drama'")
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
17 | 8.7 | Seven Samurai | UNRATED | Drama | 207 | [u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K... |
157 | 8.2 | Gone with the Wind | G | Drama | 238 | [u'Clark Gable', u'Vivien Leigh', u'Thomas Mit... |
476 | 7.8 | Hamlet | PG-13 | Drama | 242 | [u'Kenneth Branagh', u'Julie Christie', u'Dere... |
# using the '|' operator would have shown movies that are either long or dramas (or both)
movies.query("duration >=200 | genre == 'Drama'").head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
5 | 8.9 | 12 Angry Men | NOT RATED | Drama | 96 | [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals... |
7 | 8.9 | The Lord of the Rings: The Return of the King | PG-13 | Adventure | 201 | [u'Elijah Wood', u'Viggo Mortensen', u'Ian McK... |
9 | 8.9 | Fight Club | R | Drama | 139 | [u'Brad Pitt', u'Edward Norton', u'Helena Bonh... |
13 | 8.8 | Forrest Gump | PG-13 | Drama | 142 | [u'Tom Hanks', u'Robin Wright', u'Gary Sinise'] |
Range filter#
movies.loc[movies["genre"].isin(["Crime", "Drama", "Action"])].head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
movies.loc[movies["duration"].between(100, 200, inclusive="neither")].head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
6 | 8.9 | The Good, the Bad and the Ugly | NOT RATED | Western | 161 | [u'Clint Eastwood', u'Eli Wallach', u'Lee Van ... |
Reshape#
drinks = pd.read_csv("data/drinks.csv")
drinks.head()
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
# apply the 'max' function along axis 0 to calculate the maximum value in each column
drinks.loc[:, "beer_servings":"wine_servings"].apply(max, axis=0)
beer_servings 376
spirit_servings 438
wine_servings 370
dtype: int64
# convert every DataFrame element into a float
drinks.loc[:, "beer_servings":"wine_servings"].applymap(float).head()
/tmp/ipykernel_1822/1461345987.py:2: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
drinks.loc[:, "beer_servings":"wine_servings"].applymap(float).head()
beer_servings | spirit_servings | wine_servings | |
---|---|---|---|
0 | 0.0 | 0.0 | 0.0 |
1 | 89.0 | 132.0 | 54.0 |
2 | 25.0 | 0.0 | 14.0 |
3 | 245.0 | 138.0 | 312.0 |
4 | 217.0 | 57.0 | 45.0 |
# overwrite the existing DataFrame columns
drinks.loc[:, "beer_servings":"wine_servings"] = drinks.loc[
:, "beer_servings":"wine_servings"
].applymap(float)
drinks.head()
/tmp/ipykernel_1822/3081965367.py:4: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
].applymap(float)
country | beer_servings | spirit_servings | wine_servings | total_litres_of_pure_alcohol | continent | |
---|---|---|---|---|---|---|
0 | Afghanistan | 0 | 0 | 0 | 0.0 | Asia |
1 | Albania | 89 | 132 | 54 | 4.9 | Europe |
2 | Algeria | 25 | 0 | 14 | 0.7 | Africa |
3 | Andorra | 245 | 138 | 312 | 12.4 | Europe |
4 | Angola | 217 | 57 | 45 | 5.9 | Africa |
Concat#
df1 = pd.DataFrame(
{
"name": ["A", "B", "C", "D"],
"math": [60, 89, 82, 70],
"physics": [66, 95, 83, 66],
"chemistry": [61, 91, 77, 70],
}
)
df2 = pd.DataFrame(
{
"name": ["E", "F", "G", "H"],
"math": [66, 95, 83, 66],
"physics": [60, 89, 82, 70],
"chemistry": [90, 81, 78, 90],
}
)
pd.concat([df1, df2])
name | math | physics | chemistry | |
---|---|---|---|---|
0 | A | 60 | 66 | 61 |
1 | B | 89 | 95 | 91 |
2 | C | 82 | 83 | 77 |
3 | D | 70 | 66 | 70 |
0 | E | 66 | 60 | 90 |
1 | F | 95 | 89 | 81 |
2 | G | 83 | 82 | 78 |
3 | H | 66 | 70 | 90 |
pd.concat([df1, df2], ignore_index=True)
name | math | physics | chemistry | |
---|---|---|---|---|
0 | A | 60 | 66 | 61 |
1 | B | 89 | 95 | 91 |
2 | C | 82 | 83 | 77 |
3 | D | 70 | 66 | 70 |
4 | E | 66 | 60 | 90 |
5 | F | 95 | 89 | 81 |
6 | G | 83 | 82 | 78 |
7 | H | 66 | 70 | 90 |
pd.concat([df1, df2], axis=1)
name | math | physics | chemistry | name | math | physics | chemistry | |
---|---|---|---|---|---|---|---|---|
0 | A | 60 | 66 | 61 | E | 66 | 60 | 90 |
1 | B | 89 | 95 | 91 | F | 95 | 89 | 81 |
2 | C | 82 | 83 | 77 | G | 83 | 82 | 78 |
3 | D | 70 | 66 | 70 | H | 66 | 70 | 90 |
pd.concat([df1, df2], keys=["Year 1", "Year 2"], names=["Class", None]).reset_index(
level=0
)
Class | name | math | physics | chemistry | |
---|---|---|---|---|---|
0 | Year 1 | A | 60 | 66 | 61 |
1 | Year 1 | B | 89 | 95 | 91 |
2 | Year 1 | C | 82 | 83 | 77 |
3 | Year 1 | D | 70 | 66 | 70 |
0 | Year 2 | E | 66 | 60 | 90 |
1 | Year 2 | F | 95 | 89 | 81 |
2 | Year 2 | G | 83 | 82 | 78 |
3 | Year 2 | H | 66 | 70 | 90 |
pd.concat([df1, df2], sort=True)
chemistry | math | name | physics | |
---|---|---|---|---|
0 | 61 | 60 | A | 66 |
1 | 91 | 89 | B | 95 |
2 | 77 | 82 | C | 83 |
3 | 70 | 70 | D | 66 |
0 | 90 | 66 | E | 60 |
1 | 81 | 95 | F | 89 |
2 | 78 | 83 | G | 82 |
3 | 90 | 66 | H | 70 |
Merge#
movie_cols = ["movie_id", "title"]
movies = pd.read_csv(
"data/u.item", sep="|", header=None, names=movie_cols, usecols=[0, 1]
)
movies.head()
movie_id | title | |
---|---|---|
0 | 1 | Toy Story (1995) |
1 | 2 | GoldenEye (1995) |
2 | 3 | Four Rooms (1995) |
3 | 4 | Get Shorty (1995) |
4 | 5 | Copycat (1995) |
movies.shape
(1682, 2)
rating_cols = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv("data/u.data", sep="\t", header=None, names=rating_cols)
ratings.head()
user_id | movie_id | rating | timestamp | |
---|---|---|---|---|
0 | 196 | 242 | 3 | 881250949 |
1 | 186 | 302 | 3 | 891717742 |
2 | 22 | 377 | 1 | 878887116 |
3 | 244 | 51 | 2 | 880606923 |
4 | 166 | 346 | 1 | 886397596 |
ratings.shape
(100000, 4)
ratings.query("movie_id == 1").head()
user_id | movie_id | rating | timestamp | |
---|---|---|---|---|
24 | 308 | 1 | 4 | 887736532 |
454 | 287 | 1 | 5 | 875334088 |
957 | 148 | 1 | 4 | 877019411 |
971 | 280 | 1 | 4 | 891700426 |
1324 | 66 | 1 | 3 | 883601324 |
movie_ratings = movies.merge(ratings)
movie_ratings.columns
Index(['movie_id', 'title', 'user_id', 'rating', 'timestamp'], dtype='object')
movie_ratings.head()
movie_id | title | user_id | rating | timestamp | |
---|---|---|---|---|---|
0 | 1 | Toy Story (1995) | 308 | 4 | 887736532 |
1 | 1 | Toy Story (1995) | 287 | 5 | 875334088 |
2 | 1 | Toy Story (1995) | 148 | 4 | 877019411 |
3 | 1 | Toy Story (1995) | 280 | 4 | 891700426 |
4 | 1 | Toy Story (1995) | 66 | 3 | 883601324 |
print(movies.shape)
print(ratings.shape)
print(movie_ratings.shape)
(1682, 2)
(100000, 4)
(100000, 5)
movies.columns = ["m_id", "title"]
movies.columns
Index(['m_id', 'title'], dtype='object')
ratings.columns
Index(['user_id', 'movie_id', 'rating', 'timestamp'], dtype='object')
movies.merge(ratings, left_on="m_id", right_on="movie_id").head()
m_id | title | user_id | movie_id | rating | timestamp | |
---|---|---|---|---|---|---|
0 | 1 | Toy Story (1995) | 308 | 1 | 4 | 887736532 |
1 | 1 | Toy Story (1995) | 287 | 1 | 5 | 875334088 |
2 | 1 | Toy Story (1995) | 148 | 1 | 4 | 877019411 |
3 | 1 | Toy Story (1995) | 280 | 1 | 4 | 891700426 |
4 | 1 | Toy Story (1995) | 66 | 1 | 3 | 883601324 |
movies.merge(ratings, left_index=True, right_on="movie_id").head()
m_id | title | user_id | movie_id | rating | timestamp | |
---|---|---|---|---|---|---|
24 | 2 | GoldenEye (1995) | 308 | 1 | 4 | 887736532 |
454 | 2 | GoldenEye (1995) | 287 | 1 | 5 | 875334088 |
957 | 2 | GoldenEye (1995) | 148 | 1 | 4 | 877019411 |
971 | 2 | GoldenEye (1995) | 280 | 1 | 4 | 891700426 |
1324 | 2 | GoldenEye (1995) | 66 | 1 | 3 | 883601324 |
movies.merge(ratings, left_index=True, right_index=True).head()
m_id | title | user_id | movie_id | rating | timestamp | |
---|---|---|---|---|---|---|
0 | 1 | Toy Story (1995) | 196 | 242 | 3 | 881250949 |
1 | 2 | GoldenEye (1995) | 186 | 302 | 3 | 891717742 |
2 | 3 | Four Rooms (1995) | 22 | 377 | 1 | 878887116 |
3 | 4 | Get Shorty (1995) | 244 | 51 | 2 | 880606923 |
4 | 5 | Copycat (1995) | 166 | 346 | 1 | 886397596 |
Merge with MultiIndexes#
close = pd.read_csv(
"data/stocks.csv", usecols=[0, 1, 3], index_col=["Symbol", "Date"]
).sort_index()
close
Close | ||
---|---|---|
Symbol | Date | |
AAPL | 2016-10-03 | 112.52 |
2016-10-04 | 113.00 | |
2016-10-05 | 113.05 | |
CSCO | 2016-10-03 | 31.50 |
2016-10-04 | 31.35 | |
2016-10-05 | 31.59 | |
MSFT | 2016-10-03 | 57.42 |
2016-10-04 | 57.24 | |
2016-10-05 | 57.64 |
volume = pd.read_csv(
"data/stocks.csv", usecols=[0, 2, 3], index_col=["Symbol", "Date"]
).sort_index()
volume
Volume | ||
---|---|---|
Symbol | Date | |
AAPL | 2016-10-03 | 21701800 |
2016-10-04 | 29736800 | |
2016-10-05 | 21453100 | |
CSCO | 2016-10-03 | 14070500 |
2016-10-04 | 18460400 | |
2016-10-05 | 11808600 | |
MSFT | 2016-10-03 | 19189500 |
2016-10-04 | 20085900 | |
2016-10-05 | 16726400 |
both = close.merge(volume, left_index=True, right_index=True)
both
Close | Volume | ||
---|---|---|---|
Symbol | Date | ||
AAPL | 2016-10-03 | 112.52 | 21701800 |
2016-10-04 | 113.00 | 29736800 | |
2016-10-05 | 113.05 | 21453100 | |
CSCO | 2016-10-03 | 31.50 | 14070500 |
2016-10-04 | 31.35 | 18460400 | |
2016-10-05 | 31.59 | 11808600 | |
MSFT | 2016-10-03 | 57.42 | 19189500 |
2016-10-04 | 57.24 | 20085900 | |
2016-10-05 | 57.64 | 16726400 |
both.reset_index()
Symbol | Date | Close | Volume | |
---|---|---|---|---|
0 | AAPL | 2016-10-03 | 112.52 | 21701800 |
1 | AAPL | 2016-10-04 | 113.00 | 29736800 |
2 | AAPL | 2016-10-05 | 113.05 | 21453100 |
3 | CSCO | 2016-10-03 | 31.50 | 14070500 |
4 | CSCO | 2016-10-04 | 31.35 | 18460400 |
5 | CSCO | 2016-10-05 | 31.59 | 11808600 |
6 | MSFT | 2016-10-03 | 57.42 | 19189500 |
7 | MSFT | 2016-10-04 | 57.24 | 20085900 |
8 | MSFT | 2016-10-05 | 57.64 | 16726400 |
Join#
A = pd.DataFrame({"color": ["green", "yellow", "red"], "num": [1, 2, 3]})
A
color | num | |
---|---|---|
0 | green | 1 |
1 | yellow | 2 |
2 | red | 3 |
B = pd.DataFrame({"color": ["green", "yellow", "pink"], "size": ["S", "M", "L"]})
B
color | size | |
---|---|---|
0 | green | S |
1 | yellow | M |
2 | pink | L |
Inner join: Only include observations found in both A and B:
A.merge(B, how="inner")
color | num | size | |
---|---|---|---|
0 | green | 1 | S |
1 | yellow | 2 | M |
Outer join: Include observations found in either A or B:
A.merge(B, how="outer")
color | num | size | |
---|---|---|---|
0 | green | 1.0 | S |
1 | pink | NaN | L |
2 | red | 3.0 | NaN |
3 | yellow | 2.0 | M |
Left join: Include all observations found in A:
A.merge(B, how="left")
color | num | size | |
---|---|---|---|
0 | green | 1 | S |
1 | yellow | 2 | M |
2 | red | 3 | NaN |
Right join: Include all observations found in B:
A.merge(B, how="right")
color | num | size | |
---|---|---|---|
0 | green | 1.0 | S |
1 | yellow | 2.0 | M |
2 | pink | NaN | L |
Explode#
orders = pd.read_csv("data/chipotle.tsv", sep="\t")
orders.head()
order_id | quantity | item_name | choice_description | item_price | |
---|---|---|---|---|---|
0 | 1 | 1 | Chips and Fresh Tomato Salsa | NaN | $2.39 |
1 | 1 | 1 | Izze | [Clementine] | $3.39 |
2 | 1 | 1 | Nantucket Nectar | [Apple] | $3.39 |
3 | 1 | 1 | Chips and Tomatillo-Green Chili Salsa | NaN | $2.39 |
4 | 2 | 2 | Chicken Bowl | [Tomatillo-Red Chili Salsa (Hot), [Black Beans... | $16.98 |
orders["item_name"] = orders["item_name"].str.split()
orders.head()
order_id | quantity | item_name | choice_description | item_price | |
---|---|---|---|---|---|
0 | 1 | 1 | [Chips, and, Fresh, Tomato, Salsa] | NaN | $2.39 |
1 | 1 | 1 | [Izze] | [Clementine] | $3.39 |
2 | 1 | 1 | [Nantucket, Nectar] | [Apple] | $3.39 |
3 | 1 | 1 | [Chips, and, Tomatillo-Green, Chili, Salsa] | NaN | $2.39 |
4 | 2 | 2 | [Chicken, Bowl] | [Tomatillo-Red Chili Salsa (Hot), [Black Beans... | $16.98 |
orders.explode("item_name")
order_id | quantity | item_name | choice_description | item_price | |
---|---|---|---|---|---|
0 | 1 | 1 | Chips | NaN | $2.39 |
0 | 1 | 1 | and | NaN | $2.39 |
0 | 1 | 1 | Fresh | NaN | $2.39 |
0 | 1 | 1 | Tomato | NaN | $2.39 |
0 | 1 | 1 | Salsa | NaN | $2.39 |
... | ... | ... | ... | ... | ... |
4620 | 1834 | 1 | Salad | [Fresh Tomato Salsa, [Fajita Vegetables, Lettu... | $8.75 |
4620 | 1834 | 1 | Bowl | [Fresh Tomato Salsa, [Fajita Vegetables, Lettu... | $8.75 |
4621 | 1834 | 1 | Chicken | [Fresh Tomato Salsa, [Fajita Vegetables, Pinto... | $8.75 |
4621 | 1834 | 1 | Salad | [Fresh Tomato Salsa, [Fajita Vegetables, Pinto... | $8.75 |
4621 | 1834 | 1 | Bowl | [Fresh Tomato Salsa, [Fajita Vegetables, Pinto... | $8.75 |
11501 rows × 5 columns
Display#
movies = pd.read_csv("data/imdb_1000.csv")
movies.head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
movies[:10].style.bar("duration", vmin=60)
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.300000 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunton'] |
1 | 9.200000 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.100000 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duvall'] |
3 | 9.000000 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron Eckhart'] |
4 | 8.900000 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L. Jackson'] |
5 | 8.900000 | 12 Angry Men | NOT RATED | Drama | 96 | [u'Henry Fonda', u'Lee J. Cobb', u'Martin Balsam'] |
6 | 8.900000 | The Good, the Bad and the Ugly | NOT RATED | Western | 161 | [u'Clint Eastwood', u'Eli Wallach', u'Lee Van Cleef'] |
7 | 8.900000 | The Lord of the Rings: The Return of the King | PG-13 | Adventure | 201 | [u'Elijah Wood', u'Viggo Mortensen', u'Ian McKellen'] |
8 | 8.900000 | Schindler's List | R | Biography | 195 | [u'Liam Neeson', u'Ralph Fiennes', u'Ben Kingsley'] |
9 | 8.900000 | Fight Club | R | Drama | 139 | [u'Brad Pitt', u'Edward Norton', u'Helena Bonham Carter'] |
movies[:10].style.background_gradient("Greens", subset="star_rating")
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.300000 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunton'] |
1 | 9.200000 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.100000 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duvall'] |
3 | 9.000000 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron Eckhart'] |
4 | 8.900000 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L. Jackson'] |
5 | 8.900000 | 12 Angry Men | NOT RATED | Drama | 96 | [u'Henry Fonda', u'Lee J. Cobb', u'Martin Balsam'] |
6 | 8.900000 | The Good, the Bad and the Ugly | NOT RATED | Western | 161 | [u'Clint Eastwood', u'Eli Wallach', u'Lee Van Cleef'] |
7 | 8.900000 | The Lord of the Rings: The Return of the King | PG-13 | Adventure | 201 | [u'Elijah Wood', u'Viggo Mortensen', u'Ian McKellen'] |
8 | 8.900000 | Schindler's List | R | Biography | 195 | [u'Liam Neeson', u'Ralph Fiennes', u'Ben Kingsley'] |
9 | 8.900000 | Fight Club | R | Drama | 139 | [u'Brad Pitt', u'Edward Norton', u'Helena Bonham Carter'] |
(
movies[:10]
.style.bar("duration", vmin=60)
.background_gradient("Greens", subset="star_rating")
.highlight_null()
.highlight_max()
)
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.300000 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunton'] |
1 | 9.200000 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.100000 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duvall'] |
3 | 9.000000 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron Eckhart'] |
4 | 8.900000 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L. Jackson'] |
5 | 8.900000 | 12 Angry Men | NOT RATED | Drama | 96 | [u'Henry Fonda', u'Lee J. Cobb', u'Martin Balsam'] |
6 | 8.900000 | The Good, the Bad and the Ugly | NOT RATED | Western | 161 | [u'Clint Eastwood', u'Eli Wallach', u'Lee Van Cleef'] |
7 | 8.900000 | The Lord of the Rings: The Return of the King | PG-13 | Adventure | 201 | [u'Elijah Wood', u'Viggo Mortensen', u'Ian McKellen'] |
8 | 8.900000 | Schindler's List | R | Biography | 195 | [u'Liam Neeson', u'Ralph Fiennes', u'Ben Kingsley'] |
9 | 8.900000 | Fight Club | R | Drama | 139 | [u'Brad Pitt', u'Edward Norton', u'Helena Bonham Carter'] |