读写和过滤#

import pandas as pd

Read & Info#

Read tabular data#

user_cols = ["user_id", "age", "gender", "occupation", "zip_code"]
users = pd.read_csv("data/movie.user", sep="|", header=None, names=user_cols)
# examine the first 5 rows
users.head()
user_id age gender occupation zip_code
0 1 24 M technician 85711
1 2 53 F other 94043
2 3 23 M writer 32067
3 4 24 M technician 43537
4 5 33 F other 15213
# read_csv is equivalent to read_csv, except it assumes a comma separator
ufo = pd.read_csv("data//ufo.csv")
# examine the first 5 rows
ufo.head()
City Colors Reported Shape Reported State Time
0 Ithaca NaN TRIANGLE NY 6/1/1930 22:00
1 Willingboro NaN OTHER NJ 6/30/1930 20:00
2 Holyoke NaN OVAL CO 2/15/1931 14:00
3 Abilene NaN DISK KS 6/1/1931 13:00
4 New York Worlds Fair NaN LIGHT NY 4/18/1933 19:00
# specify which columns to include by name
pd.read_csv("data/ufo.csv", usecols=["City", "State"]).head()
City State
0 Ithaca NY
1 Willingboro NJ
2 Holyoke CO
3 Abilene KS
4 New York Worlds Fair NY
# specify columns by position
pd.read_csv("data/ufo.csv", usecols=[0, 4]).head()
City Time
0 Ithaca 6/1/1930 22:00
1 Willingboro 6/30/1930 20:00
2 Holyoke 2/15/1931 14:00
3 Abilene 6/1/1931 13:00
4 New York Worlds Fair 4/18/1933 19:00
# specify how many rows to read
pd.read_csv("data/ufo.csv", nrows=3).head()
City Colors Reported Shape Reported State Time
0 Ithaca NaN TRIANGLE NY 6/1/1930 22:00
1 Willingboro NaN OTHER NJ 6/30/1930 20:00
2 Holyoke NaN OVAL CO 2/15/1931 14:00

Describe data#

movies = pd.read_csv("data/imdb_1000.csv")
movies.head()
star_rating title content_rating genre duration actors_list
0 9.3 The Shawshank Redemption R Crime 142 [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...
1 9.2 The Godfather R Crime 175 [u'Marlon Brando', u'Al Pacino', u'James Caan']
2 9.1 The Godfather: Part II R Crime 200 [u'Al Pacino', u'Robert De Niro', u'Robert Duv...
3 9.0 The Dark Knight PG-13 Action 152 [u'Christian Bale', u'Heath Ledger', u'Aaron E...
4 8.9 Pulp Fiction R Crime 154 [u'John Travolta', u'Uma Thurman', u'Samuel L....
# example attribute: number of rows and columns
movies.shape
(979, 6)
# example attribute: data type of each column
movies.dtypes
star_rating       float64
title              object
content_rating     object
genre              object
duration            int64
actors_list        object
dtype: object
# example method: calculate summary statistics
movies.describe()
star_rating duration
count 979.000000 979.000000
mean 7.889785 120.979571
std 0.336069 26.218010
min 7.400000 64.000000
25% 7.600000 102.000000
50% 7.800000 117.000000
75% 8.100000 134.000000
max 9.300000 242.000000
# use an optional parameter to the describe method to summarize only 'object' columns
movies.describe(include=["object"])
title content_rating genre actors_list
count 979 976 979 979
unique 975 12 16 969
top Dracula R Drama [u'Daniel Radcliffe', u'Emma Watson', u'Rupert...
freq 2 460 278 6

Column Manipulation#

Create a new column#

# create a new 'Location' Series (must use bracket notation to define the Series name)
ufo["Location"] = f"{ufo.City}, {ufo.State}"
ufo.head()
City Colors Reported Shape Reported State Time Location
0 Ithaca NaN TRIANGLE NY 6/1/1930 22:00 0 Ithaca\n1 ...
1 Willingboro NaN OTHER NJ 6/30/1930 20:00 0 Ithaca\n1 ...
2 Holyoke NaN OVAL CO 2/15/1931 14:00 0 Ithaca\n1 ...
3 Abilene NaN DISK KS 6/1/1931 13:00 0 Ithaca\n1 ...
4 New York Worlds Fair NaN LIGHT NY 4/18/1933 19:00 0 Ithaca\n1 ...

Rename columns#

ufo = pd.read_csv("data/ufo.csv")
# examine the column names
ufo.columns
Index(['City', 'Colors Reported', 'Shape Reported', 'State', 'Time'], dtype='object')
# rename two of the columns by using the 'rename' method
ufo = ufo.rename(
    columns={"Colors Reported": "Colors_Reported", "Shape Reported": "Shape_Reported"}
)
ufo.columns
Index(['City', 'Colors_Reported', 'Shape_Reported', 'State', 'Time'], dtype='object')
# replace all of the column names by overwriting the 'columns' attribute
ufo_cols = ["city", "colors reported", "shape reported", "state", "time"]
ufo.columns = ufo_cols
ufo.columns
Index(['city', 'colors reported', 'shape reported', 'state', 'time'], dtype='object')
# replace the column names during the file reading process by using the 'names' parameter
ufo = pd.read_csv("data/ufo.csv", header=0, names=ufo_cols)
ufo.columns
Index(['city', 'colors reported', 'shape reported', 'state', 'time'], dtype='object')
# replace all spaces with underscores in the column names by using the 'str.replace' method
ufo.columns = ufo.columns.str.replace(" ", "_")
ufo.columns
Index(['city', 'colors_reported', 'shape_reported', 'state', 'time'], dtype='object')

Remove columns#

ufo = pd.read_csv("data/ufo.csv")
ufo.head()
City Colors Reported Shape Reported State Time
0 Ithaca NaN TRIANGLE NY 6/1/1930 22:00
1 Willingboro NaN OTHER NJ 6/30/1930 20:00
2 Holyoke NaN OVAL CO 2/15/1931 14:00
3 Abilene NaN DISK KS 6/1/1931 13:00
4 New York Worlds Fair NaN LIGHT NY 4/18/1933 19:00
# remove a single column (axis=1 refers to columns)
ufo = ufo.drop("Colors Reported", axis=1)
ufo.head()
City Shape Reported State Time
0 Ithaca TRIANGLE NY 6/1/1930 22:00
1 Willingboro OTHER NJ 6/30/1930 20:00
2 Holyoke OVAL CO 2/15/1931 14:00
3 Abilene DISK KS 6/1/1931 13:00
4 New York Worlds Fair LIGHT NY 4/18/1933 19:00
# remove multiple columns at once
ufo.drop(columns=["City", "State"]).head()
Shape Reported Time
0 TRIANGLE 6/1/1930 22:00
1 OTHER 6/30/1930 20:00
2 OVAL 2/15/1931 14:00
3 DISK 6/1/1931 13:00
4 LIGHT 4/18/1933 19:00

Remove rows#

# new way to drop rows: specify index
ufo.drop(index=[0, 1]).head()
City Shape Reported State Time
2 Holyoke OVAL CO 2/15/1931 14:00
3 Abilene DISK KS 6/1/1931 13:00
4 New York Worlds Fair LIGHT NY 4/18/1933 19:00
5 Valley City DISK ND 9/15/1934 15:30
6 Crater Lake CIRCLE CA 6/15/1935 0:00

Select#

Iterate#

ufo = pd.read_csv("data/ufo.csv")
ufo.head()
City Colors Reported Shape Reported State Time
0 Ithaca NaN TRIANGLE NY 6/1/1930 22:00
1 Willingboro NaN OTHER NJ 6/30/1930 20:00
2 Holyoke NaN OVAL CO 2/15/1931 14:00
3 Abilene NaN DISK KS 6/1/1931 13:00
4 New York Worlds Fair NaN LIGHT NY 4/18/1933 19:00
# various methods are available to iterate through a DataFrame
for index, row in ufo[:10].iterrows():
    print(index, row.City, row.State)
0 Ithaca NY
1 Willingboro NJ
2 Holyoke CO
3 Abilene KS
4 New York Worlds Fair NY
5 Valley City ND
6 Crater Lake CA
7 Alma MI
8 Eklutna AK
9 Hubbard OR

Index#

# index and columns both default to integers if you don't define them
pd.read_csv("data/drinks.csv", header=None).head()
0 1 2 3 4 5
0 country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
1 Afghanistan 0 0 0 0.0 Asia
2 Albania 89 132 54 4.9 Europe
3 Algeria 25 0 14 0.7 Africa
4 Andorra 245 138 312 12.4 Europe
drinks = pd.read_csv("data/drinks.csv")
drinks.head()
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
0 Afghanistan 0 0 0 0.0 Asia
1 Albania 89 132 54 4.9 Europe
2 Algeria 25 0 14 0.7 Africa
3 Andorra 245 138 312 12.4 Europe
4 Angola 217 57 45 5.9 Africa
# every DataFrame has an index (sometimes called the "row labels")
drinks.index
RangeIndex(start=0, stop=193, step=1)
# set an existing column as the index
drinks = drinks.set_index("country")
drinks.head()
# 'country' is now the index
drinks.index
Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
       ...
       'Tanzania', 'USA', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela',
       'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],
      dtype='object', name='country', length=193)
# restore the index name, and move the index back to a column
drinks.index.name = "country"
drinks = drinks.reset_index()
drinks.head()
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
0 Afghanistan 0 0 0 0.0 Asia
1 Albania 89 132 54 4.9 Europe
2 Algeria 25 0 14 0.7 Africa
3 Andorra 245 138 312 12.4 Europe
4 Angola 217 57 45 5.9 Africa

Select a column#

drinks = pd.read_csv("data/drinks.csv")
drinks.head()
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
0 Afghanistan 0 0 0 0.0 Asia
1 Albania 89 132 54 4.9 Europe
2 Algeria 25 0 14 0.7 Africa
3 Andorra 245 138 312 12.4 Europe
4 Angola 217 57 45 5.9 Africa
# select the 'City' Series using bracket notation
drinks["country"]
# or equivalently, use dot notation
drinks.country
0      Afghanistan
1          Albania
2          Algeria
3          Andorra
4           Angola
          ...     
188      Venezuela
189        Vietnam
190          Yemen
191         Zambia
192       Zimbabwe
Name: country, Length: 193, dtype: object
drink = "wine"
drinks[f"{drink}_servings"]
0        0
1       54
2       14
3      312
4       45
      ... 
188      3
189      1
190      0
191      4
192      4
Name: wine_servings, Length: 193, dtype: int64

Select numeric columns#

drinks.dtypes
country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object
import numpy as np

drinks.select_dtypes(include=[np.number]).dtypes
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
dtype: object
drinks.select_dtypes(include=[np.number]).squeeze("columns")
beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol
0 0 0 0 0.0
1 89 132 54 4.9
2 25 0 14 0.7
3 245 138 312 12.4
4 217 57 45 5.9
... ... ... ... ...
188 333 100 3 7.7
189 111 2 1 2.0
190 6 0 0 0.1
191 32 19 4 2.5
192 64 18 4 4.7

193 rows × 4 columns

Select rows and columns#

# row 0, all columns
drinks.loc[0, :]
country                         Afghanistan
beer_servings                             0
spirit_servings                           0
wine_servings                             0
total_litres_of_pure_alcohol            0.0
continent                              Asia
Name: 0, dtype: object
# rows 0 and 1 and 2, all columns
drinks.loc[[0, 1, 2], :]
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
0 Afghanistan 0 0 0 0.0 Asia
1 Albania 89 132 54 4.9 Europe
2 Algeria 25 0 14 0.7 Africa
# rows 0 through 2 (inclusive), all columns
drinks.loc[0:2, :]
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
0 Afghanistan 0 0 0 0.0 Asia
1 Albania 89 132 54 4.9 Europe
2 Algeria 25 0 14 0.7 Africa
# rows 0 through 2 (inclusive), column 'City'
drinks.loc[0:2, "country"]
0    Afghanistan
1        Albania
2        Algeria
Name: country, dtype: object
# rows 0 through 2 (inclusive), columns 'City' and 'State'
drinks.loc[0:2, ["country", "continent"]]
country continent
0 Afghanistan Asia
1 Albania Europe
2 Algeria Africa
# rows 0 through 2 (inclusive), columns 'City' through 'State' (inclusive)
drinks.loc[0:2, "country":"continent"]
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
0 Afghanistan 0 0 0 0.0 Asia
1 Albania 89 132 54 4.9 Europe
2 Algeria 25 0 14 0.7 Africa
# rows in positions 0 and 1, columns in positions 0 and 3
drinks.iloc[[0, 1], [0, 3]]
country wine_servings
0 Afghanistan 0
1 Albania 54
# rows in positions 0 through 2 (exclusive), columns in positions 0 through 4 (exclusive)
drinks.iloc[0:2, 0:4]
country beer_servings spirit_servings wine_servings
0 Afghanistan 0 0 0
1 Albania 89 132 54
# rows in positions 0 through 2 (exclusive), all columns
drinks.iloc[0:2, :]
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
0 Afghanistan 0 0 0 0.0 Asia
1 Albania 89 132 54 4.9 Europe

Filter#

Column filter#

drinks.filter(items=["beer_servings", "continent"])
beer_servings continent
0 0 Asia
1 89 Europe
2 25 Africa
3 245 Europe
4 217 Africa
... ... ...
188 333 South America
189 111 Asia
190 6 Asia
191 32 Africa
192 64 Africa

193 rows × 2 columns

drinks.filter(regex="servings", axis=1)
beer_servings spirit_servings wine_servings
0 0 0 0
1 89 132 54
2 25 0 14
3 245 138 312
4 217 57 45
... ... ... ...
188 333 100 3
189 111 2 1
190 6 0 0
191 32 19 4
192 64 18 4

193 rows × 3 columns

Index filter#

drinks.filter(like="2", axis=0).head()
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
2 Algeria 25 0 14 0.7 Africa
12 Bahrain 42 63 7 2.0 Asia
20 Bolivia 167 41 8 3.8 South America
21 Bosnia-Herzegovina 76 173 8 4.6 Europe
22 Botswana 173 35 35 5.4 Africa

Value filter#

drinks.loc[drinks.continent == "South America"]
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
6 Argentina 193 25 221 8.3 South America
20 Bolivia 167 41 8 3.8 South America
23 Brazil 245 145 16 7.2 South America
35 Chile 130 124 172 7.6 South America
37 Colombia 159 76 3 4.2 South America
52 Ecuador 162 74 3 4.2 South America
72 Guyana 93 302 1 7.1 South America
132 Paraguay 213 117 74 7.3 South America
133 Peru 163 160 21 6.1 South America
163 Suriname 128 178 7 5.6 South America
185 Uruguay 115 35 220 6.6 South America
188 Venezuela 333 100 3 7.7 South America
drinks.loc[drinks.continent == "South America", "wine_servings"]
6      221
20       8
23      16
35     172
37       3
52       3
72       1
132     74
133     21
163      7
185    220
188      3
Name: wine_servings, dtype: int64
drinks.query("continent=='South America'")["wine_servings"]
6      221
20       8
23      16
35     172
37       3
52       3
72       1
132     74
133     21
163      7
185    220
188      3
Name: wine_servings, dtype: int64
movies = pd.read_csv("data/imdb_1000.csv")
movies.head()
star_rating title content_rating genre duration actors_list
0 9.3 The Shawshank Redemption R Crime 142 [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...
1 9.2 The Godfather R Crime 175 [u'Marlon Brando', u'Al Pacino', u'James Caan']
2 9.1 The Godfather: Part II R Crime 200 [u'Al Pacino', u'Robert De Niro', u'Robert Duv...
3 9.0 The Dark Knight PG-13 Action 152 [u'Christian Bale', u'Heath Ledger', u'Aaron E...
4 8.9 Pulp Fiction R Crime 154 [u'John Travolta', u'Uma Thurman', u'Samuel L....
# use the '&' operator to specify that both conditions are required
movies.query("duration >=200 & genre == 'Drama'")
star_rating title content_rating genre duration actors_list
17 8.7 Seven Samurai UNRATED Drama 207 [u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K...
157 8.2 Gone with the Wind G Drama 238 [u'Clark Gable', u'Vivien Leigh', u'Thomas Mit...
476 7.8 Hamlet PG-13 Drama 242 [u'Kenneth Branagh', u'Julie Christie', u'Dere...
# using the '|' operator would have shown movies that are either long or dramas (or both)
movies.query("duration >=200 | genre == 'Drama'").head()
star_rating title content_rating genre duration actors_list
2 9.1 The Godfather: Part II R Crime 200 [u'Al Pacino', u'Robert De Niro', u'Robert Duv...
5 8.9 12 Angry Men NOT RATED Drama 96 [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals...
7 8.9 The Lord of the Rings: The Return of the King PG-13 Adventure 201 [u'Elijah Wood', u'Viggo Mortensen', u'Ian McK...
9 8.9 Fight Club R Drama 139 [u'Brad Pitt', u'Edward Norton', u'Helena Bonh...
13 8.8 Forrest Gump PG-13 Drama 142 [u'Tom Hanks', u'Robin Wright', u'Gary Sinise']

Range filter#

movies.loc[movies["genre"].isin(["Crime", "Drama", "Action"])].head()
star_rating title content_rating genre duration actors_list
0 9.3 The Shawshank Redemption R Crime 142 [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...
1 9.2 The Godfather R Crime 175 [u'Marlon Brando', u'Al Pacino', u'James Caan']
2 9.1 The Godfather: Part II R Crime 200 [u'Al Pacino', u'Robert De Niro', u'Robert Duv...
3 9.0 The Dark Knight PG-13 Action 152 [u'Christian Bale', u'Heath Ledger', u'Aaron E...
4 8.9 Pulp Fiction R Crime 154 [u'John Travolta', u'Uma Thurman', u'Samuel L....
movies.loc[movies["duration"].between(100, 200, inclusive="neither")].head()
star_rating title content_rating genre duration actors_list
0 9.3 The Shawshank Redemption R Crime 142 [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...
1 9.2 The Godfather R Crime 175 [u'Marlon Brando', u'Al Pacino', u'James Caan']
3 9.0 The Dark Knight PG-13 Action 152 [u'Christian Bale', u'Heath Ledger', u'Aaron E...
4 8.9 Pulp Fiction R Crime 154 [u'John Travolta', u'Uma Thurman', u'Samuel L....
6 8.9 The Good, the Bad and the Ugly NOT RATED Western 161 [u'Clint Eastwood', u'Eli Wallach', u'Lee Van ...

Reshape#

drinks = pd.read_csv("data/drinks.csv")
drinks.head()
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
0 Afghanistan 0 0 0 0.0 Asia
1 Albania 89 132 54 4.9 Europe
2 Algeria 25 0 14 0.7 Africa
3 Andorra 245 138 312 12.4 Europe
4 Angola 217 57 45 5.9 Africa
# apply the 'max' function along axis 0 to calculate the maximum value in each column
drinks.loc[:, "beer_servings":"wine_servings"].apply(max, axis=0)
beer_servings      376
spirit_servings    438
wine_servings      370
dtype: int64
# convert every DataFrame element into a float
drinks.loc[:, "beer_servings":"wine_servings"].applymap(float).head()
/tmp/ipykernel_1822/1461345987.py:2: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
  drinks.loc[:, "beer_servings":"wine_servings"].applymap(float).head()
beer_servings spirit_servings wine_servings
0 0.0 0.0 0.0
1 89.0 132.0 54.0
2 25.0 0.0 14.0
3 245.0 138.0 312.0
4 217.0 57.0 45.0
# overwrite the existing DataFrame columns
drinks.loc[:, "beer_servings":"wine_servings"] = drinks.loc[
    :, "beer_servings":"wine_servings"
].applymap(float)
drinks.head()
/tmp/ipykernel_1822/3081965367.py:4: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
  ].applymap(float)
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
0 Afghanistan 0 0 0 0.0 Asia
1 Albania 89 132 54 4.9 Europe
2 Algeria 25 0 14 0.7 Africa
3 Andorra 245 138 312 12.4 Europe
4 Angola 217 57 45 5.9 Africa

Concat#

df1 = pd.DataFrame(
    {
        "name": ["A", "B", "C", "D"],
        "math": [60, 89, 82, 70],
        "physics": [66, 95, 83, 66],
        "chemistry": [61, 91, 77, 70],
    }
)

df2 = pd.DataFrame(
    {
        "name": ["E", "F", "G", "H"],
        "math": [66, 95, 83, 66],
        "physics": [60, 89, 82, 70],
        "chemistry": [90, 81, 78, 90],
    }
)
pd.concat([df1, df2])
name math physics chemistry
0 A 60 66 61
1 B 89 95 91
2 C 82 83 77
3 D 70 66 70
0 E 66 60 90
1 F 95 89 81
2 G 83 82 78
3 H 66 70 90
pd.concat([df1, df2], ignore_index=True)
name math physics chemistry
0 A 60 66 61
1 B 89 95 91
2 C 82 83 77
3 D 70 66 70
4 E 66 60 90
5 F 95 89 81
6 G 83 82 78
7 H 66 70 90
pd.concat([df1, df2], axis=1)
name math physics chemistry name math physics chemistry
0 A 60 66 61 E 66 60 90
1 B 89 95 91 F 95 89 81
2 C 82 83 77 G 83 82 78
3 D 70 66 70 H 66 70 90
pd.concat([df1, df2], keys=["Year 1", "Year 2"], names=["Class", None]).reset_index(
    level=0
)
Class name math physics chemistry
0 Year 1 A 60 66 61
1 Year 1 B 89 95 91
2 Year 1 C 82 83 77
3 Year 1 D 70 66 70
0 Year 2 E 66 60 90
1 Year 2 F 95 89 81
2 Year 2 G 83 82 78
3 Year 2 H 66 70 90
pd.concat([df1, df2], sort=True)
chemistry math name physics
0 61 60 A 66
1 91 89 B 95
2 77 82 C 83
3 70 70 D 66
0 90 66 E 60
1 81 95 F 89
2 78 83 G 82
3 90 66 H 70

Merge#

movie_cols = ["movie_id", "title"]
movies = pd.read_csv(
    "data/u.item", sep="|", header=None, names=movie_cols, usecols=[0, 1]
)
movies.head()
movie_id title
0 1 Toy Story (1995)
1 2 GoldenEye (1995)
2 3 Four Rooms (1995)
3 4 Get Shorty (1995)
4 5 Copycat (1995)
movies.shape
(1682, 2)
rating_cols = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv("data/u.data", sep="\t", header=None, names=rating_cols)
ratings.head()
user_id movie_id rating timestamp
0 196 242 3 881250949
1 186 302 3 891717742
2 22 377 1 878887116
3 244 51 2 880606923
4 166 346 1 886397596
ratings.shape
(100000, 4)
ratings.query("movie_id == 1").head()
user_id movie_id rating timestamp
24 308 1 4 887736532
454 287 1 5 875334088
957 148 1 4 877019411
971 280 1 4 891700426
1324 66 1 3 883601324
movie_ratings = movies.merge(ratings)
movie_ratings.columns
Index(['movie_id', 'title', 'user_id', 'rating', 'timestamp'], dtype='object')
movie_ratings.head()
movie_id title user_id rating timestamp
0 1 Toy Story (1995) 308 4 887736532
1 1 Toy Story (1995) 287 5 875334088
2 1 Toy Story (1995) 148 4 877019411
3 1 Toy Story (1995) 280 4 891700426
4 1 Toy Story (1995) 66 3 883601324
print(movies.shape)
print(ratings.shape)
print(movie_ratings.shape)
(1682, 2)
(100000, 4)
(100000, 5)
movies.columns = ["m_id", "title"]
movies.columns
Index(['m_id', 'title'], dtype='object')
ratings.columns
Index(['user_id', 'movie_id', 'rating', 'timestamp'], dtype='object')
movies.merge(ratings, left_on="m_id", right_on="movie_id").head()
m_id title user_id movie_id rating timestamp
0 1 Toy Story (1995) 308 1 4 887736532
1 1 Toy Story (1995) 287 1 5 875334088
2 1 Toy Story (1995) 148 1 4 877019411
3 1 Toy Story (1995) 280 1 4 891700426
4 1 Toy Story (1995) 66 1 3 883601324
movies.merge(ratings, left_index=True, right_on="movie_id").head()
m_id title user_id movie_id rating timestamp
24 2 GoldenEye (1995) 308 1 4 887736532
454 2 GoldenEye (1995) 287 1 5 875334088
957 2 GoldenEye (1995) 148 1 4 877019411
971 2 GoldenEye (1995) 280 1 4 891700426
1324 2 GoldenEye (1995) 66 1 3 883601324
movies.merge(ratings, left_index=True, right_index=True).head()
m_id title user_id movie_id rating timestamp
0 1 Toy Story (1995) 196 242 3 881250949
1 2 GoldenEye (1995) 186 302 3 891717742
2 3 Four Rooms (1995) 22 377 1 878887116
3 4 Get Shorty (1995) 244 51 2 880606923
4 5 Copycat (1995) 166 346 1 886397596

Merge with MultiIndexes#

close = pd.read_csv(
    "data/stocks.csv", usecols=[0, 1, 3], index_col=["Symbol", "Date"]
).sort_index()
close
Close
Symbol Date
AAPL 2016-10-03 112.52
2016-10-04 113.00
2016-10-05 113.05
CSCO 2016-10-03 31.50
2016-10-04 31.35
2016-10-05 31.59
MSFT 2016-10-03 57.42
2016-10-04 57.24
2016-10-05 57.64
volume = pd.read_csv(
    "data/stocks.csv", usecols=[0, 2, 3], index_col=["Symbol", "Date"]
).sort_index()
volume
Volume
Symbol Date
AAPL 2016-10-03 21701800
2016-10-04 29736800
2016-10-05 21453100
CSCO 2016-10-03 14070500
2016-10-04 18460400
2016-10-05 11808600
MSFT 2016-10-03 19189500
2016-10-04 20085900
2016-10-05 16726400
both = close.merge(volume, left_index=True, right_index=True)
both
Close Volume
Symbol Date
AAPL 2016-10-03 112.52 21701800
2016-10-04 113.00 29736800
2016-10-05 113.05 21453100
CSCO 2016-10-03 31.50 14070500
2016-10-04 31.35 18460400
2016-10-05 31.59 11808600
MSFT 2016-10-03 57.42 19189500
2016-10-04 57.24 20085900
2016-10-05 57.64 16726400
both.reset_index()
Symbol Date Close Volume
0 AAPL 2016-10-03 112.52 21701800
1 AAPL 2016-10-04 113.00 29736800
2 AAPL 2016-10-05 113.05 21453100
3 CSCO 2016-10-03 31.50 14070500
4 CSCO 2016-10-04 31.35 18460400
5 CSCO 2016-10-05 31.59 11808600
6 MSFT 2016-10-03 57.42 19189500
7 MSFT 2016-10-04 57.24 20085900
8 MSFT 2016-10-05 57.64 16726400

Join#

A = pd.DataFrame({"color": ["green", "yellow", "red"], "num": [1, 2, 3]})
A
color num
0 green 1
1 yellow 2
2 red 3
B = pd.DataFrame({"color": ["green", "yellow", "pink"], "size": ["S", "M", "L"]})
B
color size
0 green S
1 yellow M
2 pink L

Inner join: Only include observations found in both A and B:

A.merge(B, how="inner")
color num size
0 green 1 S
1 yellow 2 M

Outer join: Include observations found in either A or B:

A.merge(B, how="outer")
color num size
0 green 1.0 S
1 pink NaN L
2 red 3.0 NaN
3 yellow 2.0 M

Left join: Include all observations found in A:

A.merge(B, how="left")
color num size
0 green 1 S
1 yellow 2 M
2 red 3 NaN

Right join: Include all observations found in B:

A.merge(B, how="right")
color num size
0 green 1.0 S
1 yellow 2.0 M
2 pink NaN L

Explode#

orders = pd.read_csv("data/chipotle.tsv", sep="\t")
orders.head()
order_id quantity item_name choice_description item_price
0 1 1 Chips and Fresh Tomato Salsa NaN $2.39
1 1 1 Izze [Clementine] $3.39
2 1 1 Nantucket Nectar [Apple] $3.39
3 1 1 Chips and Tomatillo-Green Chili Salsa NaN $2.39
4 2 2 Chicken Bowl [Tomatillo-Red Chili Salsa (Hot), [Black Beans... $16.98
orders["item_name"] = orders["item_name"].str.split()
orders.head()
order_id quantity item_name choice_description item_price
0 1 1 [Chips, and, Fresh, Tomato, Salsa] NaN $2.39
1 1 1 [Izze] [Clementine] $3.39
2 1 1 [Nantucket, Nectar] [Apple] $3.39
3 1 1 [Chips, and, Tomatillo-Green, Chili, Salsa] NaN $2.39
4 2 2 [Chicken, Bowl] [Tomatillo-Red Chili Salsa (Hot), [Black Beans... $16.98
orders.explode("item_name")
order_id quantity item_name choice_description item_price
0 1 1 Chips NaN $2.39
0 1 1 and NaN $2.39
0 1 1 Fresh NaN $2.39
0 1 1 Tomato NaN $2.39
0 1 1 Salsa NaN $2.39
... ... ... ... ... ...
4620 1834 1 Salad [Fresh Tomato Salsa, [Fajita Vegetables, Lettu... $8.75
4620 1834 1 Bowl [Fresh Tomato Salsa, [Fajita Vegetables, Lettu... $8.75
4621 1834 1 Chicken [Fresh Tomato Salsa, [Fajita Vegetables, Pinto... $8.75
4621 1834 1 Salad [Fresh Tomato Salsa, [Fajita Vegetables, Pinto... $8.75
4621 1834 1 Bowl [Fresh Tomato Salsa, [Fajita Vegetables, Pinto... $8.75

11501 rows × 5 columns

Display#

movies = pd.read_csv("data/imdb_1000.csv")
movies.head()
star_rating title content_rating genre duration actors_list
0 9.3 The Shawshank Redemption R Crime 142 [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...
1 9.2 The Godfather R Crime 175 [u'Marlon Brando', u'Al Pacino', u'James Caan']
2 9.1 The Godfather: Part II R Crime 200 [u'Al Pacino', u'Robert De Niro', u'Robert Duv...
3 9.0 The Dark Knight PG-13 Action 152 [u'Christian Bale', u'Heath Ledger', u'Aaron E...
4 8.9 Pulp Fiction R Crime 154 [u'John Travolta', u'Uma Thurman', u'Samuel L....
movies[:10].style.bar("duration", vmin=60)
  star_rating title content_rating genre duration actors_list
0 9.300000 The Shawshank Redemption R Crime 142 [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunton']
1 9.200000 The Godfather R Crime 175 [u'Marlon Brando', u'Al Pacino', u'James Caan']
2 9.100000 The Godfather: Part II R Crime 200 [u'Al Pacino', u'Robert De Niro', u'Robert Duvall']
3 9.000000 The Dark Knight PG-13 Action 152 [u'Christian Bale', u'Heath Ledger', u'Aaron Eckhart']
4 8.900000 Pulp Fiction R Crime 154 [u'John Travolta', u'Uma Thurman', u'Samuel L. Jackson']
5 8.900000 12 Angry Men NOT RATED Drama 96 [u'Henry Fonda', u'Lee J. Cobb', u'Martin Balsam']
6 8.900000 The Good, the Bad and the Ugly NOT RATED Western 161 [u'Clint Eastwood', u'Eli Wallach', u'Lee Van Cleef']
7 8.900000 The Lord of the Rings: The Return of the King PG-13 Adventure 201 [u'Elijah Wood', u'Viggo Mortensen', u'Ian McKellen']
8 8.900000 Schindler's List R Biography 195 [u'Liam Neeson', u'Ralph Fiennes', u'Ben Kingsley']
9 8.900000 Fight Club R Drama 139 [u'Brad Pitt', u'Edward Norton', u'Helena Bonham Carter']
movies[:10].style.background_gradient("Greens", subset="star_rating")
  star_rating title content_rating genre duration actors_list
0 9.300000 The Shawshank Redemption R Crime 142 [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunton']
1 9.200000 The Godfather R Crime 175 [u'Marlon Brando', u'Al Pacino', u'James Caan']
2 9.100000 The Godfather: Part II R Crime 200 [u'Al Pacino', u'Robert De Niro', u'Robert Duvall']
3 9.000000 The Dark Knight PG-13 Action 152 [u'Christian Bale', u'Heath Ledger', u'Aaron Eckhart']
4 8.900000 Pulp Fiction R Crime 154 [u'John Travolta', u'Uma Thurman', u'Samuel L. Jackson']
5 8.900000 12 Angry Men NOT RATED Drama 96 [u'Henry Fonda', u'Lee J. Cobb', u'Martin Balsam']
6 8.900000 The Good, the Bad and the Ugly NOT RATED Western 161 [u'Clint Eastwood', u'Eli Wallach', u'Lee Van Cleef']
7 8.900000 The Lord of the Rings: The Return of the King PG-13 Adventure 201 [u'Elijah Wood', u'Viggo Mortensen', u'Ian McKellen']
8 8.900000 Schindler's List R Biography 195 [u'Liam Neeson', u'Ralph Fiennes', u'Ben Kingsley']
9 8.900000 Fight Club R Drama 139 [u'Brad Pitt', u'Edward Norton', u'Helena Bonham Carter']
(
    movies[:10]
    .style.bar("duration", vmin=60)
    .background_gradient("Greens", subset="star_rating")
    .highlight_null()
    .highlight_max()
)
  star_rating title content_rating genre duration actors_list
0 9.300000 The Shawshank Redemption R Crime 142 [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunton']
1 9.200000 The Godfather R Crime 175 [u'Marlon Brando', u'Al Pacino', u'James Caan']
2 9.100000 The Godfather: Part II R Crime 200 [u'Al Pacino', u'Robert De Niro', u'Robert Duvall']
3 9.000000 The Dark Knight PG-13 Action 152 [u'Christian Bale', u'Heath Ledger', u'Aaron Eckhart']
4 8.900000 Pulp Fiction R Crime 154 [u'John Travolta', u'Uma Thurman', u'Samuel L. Jackson']
5 8.900000 12 Angry Men NOT RATED Drama 96 [u'Henry Fonda', u'Lee J. Cobb', u'Martin Balsam']
6 8.900000 The Good, the Bad and the Ugly NOT RATED Western 161 [u'Clint Eastwood', u'Eli Wallach', u'Lee Van Cleef']
7 8.900000 The Lord of the Rings: The Return of the King PG-13 Adventure 201 [u'Elijah Wood', u'Viggo Mortensen', u'Ian McKellen']
8 8.900000 Schindler's List R Biography 195 [u'Liam Neeson', u'Ralph Fiennes', u'Ben Kingsley']
9 8.900000 Fight Club R Drama 139 [u'Brad Pitt', u'Edward Norton', u'Helena Bonham Carter']