读写和过滤

读写和过滤#

import pandas as pd

Read & Info#

Read tabular data#

user_cols = ["user_id", "age", "gender", "occupation", "zip_code"]
users = pd.read_csv("data/movie.user", sep="|", header=None, names=user_cols)

# examine the first 5 rows
users.head()

	user_id	age	gender	occupation	zip_code
0	1	24	M	technician	85711
1	2	53	F	other	94043
2	3	23	M	writer	32067
3	4	24	M	technician	43537
4	5	33	F	other	15213

# read_csv is equivalent to read_csv, except it assumes a comma separator
ufo = pd.read_csv("data//ufo.csv")

# examine the first 5 rows
ufo.head()

	City	Colors Reported	Shape Reported	State	Time
0	Ithaca	NaN	TRIANGLE	NY	6/1/1930 22:00
1	Willingboro	NaN	OTHER	NJ	6/30/1930 20:00
2	Holyoke	NaN	OVAL	CO	2/15/1931 14:00
3	Abilene	NaN	DISK	KS	6/1/1931 13:00
4	New York Worlds Fair	NaN	LIGHT	NY	4/18/1933 19:00

# specify which columns to include by name
pd.read_csv("data/ufo.csv", usecols=["City", "State"]).head()

	City	State
0	Ithaca	NY
1	Willingboro	NJ
2	Holyoke	CO
3	Abilene	KS
4	New York Worlds Fair	NY

# specify columns by position
pd.read_csv("data/ufo.csv", usecols=[0, 4]).head()

	City	Time
0	Ithaca	6/1/1930 22:00
1	Willingboro	6/30/1930 20:00
2	Holyoke	2/15/1931 14:00
3	Abilene	6/1/1931 13:00
4	New York Worlds Fair	4/18/1933 19:00

# specify how many rows to read
pd.read_csv("data/ufo.csv", nrows=3).head()

	City	Colors Reported	Shape Reported	State	Time
0	Ithaca	NaN	TRIANGLE	NY	6/1/1930 22:00
1	Willingboro	NaN	OTHER	NJ	6/30/1930 20:00
2	Holyoke	NaN	OVAL	CO	2/15/1931 14:00

Describe data#

movies = pd.read_csv("data/imdb_1000.csv")
movies.head()

	star_rating	title	content_rating	genre	duration	actors_list
0	9.3	The Shawshank Redemption	R	Crime	142	[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...
1	9.2	The Godfather	R	Crime	175	[u'Marlon Brando', u'Al Pacino', u'James Caan']
2	9.1	The Godfather: Part II	R	Crime	200	[u'Al Pacino', u'Robert De Niro', u'Robert Duv...
3	9.0	The Dark Knight	PG-13	Action	152	[u'Christian Bale', u'Heath Ledger', u'Aaron E...
4	8.9	Pulp Fiction	R	Crime	154	[u'John Travolta', u'Uma Thurman', u'Samuel L....

# example attribute: number of rows and columns
movies.shape

(979, 6)

# example attribute: data type of each column
movies.dtypes

star_rating       float64
title              object
content_rating     object
genre              object
duration            int64
actors_list        object
dtype: object

# example method: calculate summary statistics
movies.describe()

	star_rating	duration
count	979.000000	979.000000
mean	7.889785	120.979571
std	0.336069	26.218010
min	7.400000	64.000000
25%	7.600000	102.000000
50%	7.800000	117.000000
75%	8.100000	134.000000
max	9.300000	242.000000

# use an optional parameter to the describe method to summarize only 'object' columns
movies.describe(include=["object"])

	title	content_rating	genre	actors_list
count	979	976	979	979
unique	975	12	16	969
top	Dracula	R	Drama	[u'Daniel Radcliffe', u'Emma Watson', u'Rupert...
freq	2	460	278	6

Column Manipulation#

Create a new column#

# create a new 'Location' Series (must use bracket notation to define the Series name)
ufo["Location"] = f"{ufo.City}, {ufo.State}"
ufo.head()

	City	Colors Reported	Shape Reported	State	Time	Location
0	Ithaca	NaN	TRIANGLE	NY	6/1/1930 22:00	0 Ithaca\n1 ...
1	Willingboro	NaN	OTHER	NJ	6/30/1930 20:00	0 Ithaca\n1 ...
2	Holyoke	NaN	OVAL	CO	2/15/1931 14:00	0 Ithaca\n1 ...
3	Abilene	NaN	DISK	KS	6/1/1931 13:00	0 Ithaca\n1 ...
4	New York Worlds Fair	NaN	LIGHT	NY	4/18/1933 19:00	0 Ithaca\n1 ...

Rename columns#

ufo = pd.read_csv("data/ufo.csv")

# examine the column names
ufo.columns

Index(['City', 'Colors Reported', 'Shape Reported', 'State', 'Time'], dtype='object')

# rename two of the columns by using the 'rename' method
ufo = ufo.rename(
    columns={"Colors Reported": "Colors_Reported", "Shape Reported": "Shape_Reported"}
)
ufo.columns

Index(['City', 'Colors_Reported', 'Shape_Reported', 'State', 'Time'], dtype='object')

# replace all of the column names by overwriting the 'columns' attribute
ufo_cols = ["city", "colors reported", "shape reported", "state", "time"]
ufo.columns = ufo_cols
ufo.columns

Index(['city', 'colors reported', 'shape reported', 'state', 'time'], dtype='object')

# replace the column names during the file reading process by using the 'names' parameter
ufo = pd.read_csv("data/ufo.csv", header=0, names=ufo_cols)
ufo.columns

Index(['city', 'colors reported', 'shape reported', 'state', 'time'], dtype='object')

# replace all spaces with underscores in the column names by using the 'str.replace' method
ufo.columns = ufo.columns.str.replace(" ", "_")
ufo.columns

Index(['city', 'colors_reported', 'shape_reported', 'state', 'time'], dtype='object')

Remove columns#

ufo = pd.read_csv("data/ufo.csv")
ufo.head()

	City	Colors Reported	Shape Reported	State	Time
0	Ithaca	NaN	TRIANGLE	NY	6/1/1930 22:00
1	Willingboro	NaN	OTHER	NJ	6/30/1930 20:00
2	Holyoke	NaN	OVAL	CO	2/15/1931 14:00
3	Abilene	NaN	DISK	KS	6/1/1931 13:00
4	New York Worlds Fair	NaN	LIGHT	NY	4/18/1933 19:00

# remove a single column (axis=1 refers to columns)
ufo = ufo.drop("Colors Reported", axis=1)
ufo.head()

	City	Shape Reported	State	Time
0	Ithaca	TRIANGLE	NY	6/1/1930 22:00
1	Willingboro	OTHER	NJ	6/30/1930 20:00
2	Holyoke	OVAL	CO	2/15/1931 14:00
3	Abilene	DISK	KS	6/1/1931 13:00
4	New York Worlds Fair	LIGHT	NY	4/18/1933 19:00

# remove multiple columns at once
ufo.drop(columns=["City", "State"]).head()

	Shape Reported	Time
0	TRIANGLE	6/1/1930 22:00
1	OTHER	6/30/1930 20:00
2	OVAL	2/15/1931 14:00
3	DISK	6/1/1931 13:00
4	LIGHT	4/18/1933 19:00

Remove rows#

# new way to drop rows: specify index
ufo.drop(index=[0, 1]).head()

	City	Shape Reported	State	Time
2	Holyoke	OVAL	CO	2/15/1931 14:00
3	Abilene	DISK	KS	6/1/1931 13:00
4	New York Worlds Fair	LIGHT	NY	4/18/1933 19:00
5	Valley City	DISK	ND	9/15/1934 15:30
6	Crater Lake	CIRCLE	CA	6/15/1935 0:00

Select#

Iterate#

ufo = pd.read_csv("data/ufo.csv")
ufo.head()

	City	Colors Reported	Shape Reported	State	Time
0	Ithaca	NaN	TRIANGLE	NY	6/1/1930 22:00
1	Willingboro	NaN	OTHER	NJ	6/30/1930 20:00
2	Holyoke	NaN	OVAL	CO	2/15/1931 14:00
3	Abilene	NaN	DISK	KS	6/1/1931 13:00
4	New York Worlds Fair	NaN	LIGHT	NY	4/18/1933 19:00

# various methods are available to iterate through a DataFrame
for index, row in ufo[:10].iterrows():
    print(index, row.City, row.State)

Ithaca NY
Willingboro NJ
Holyoke CO
Abilene KS
New York Worlds Fair NY
Valley City ND
Crater Lake CA
Alma MI
Eklutna AK
Hubbard OR

Index#

# index and columns both default to integers if you don't define them
pd.read_csv("data/drinks.csv", header=None).head()

	0	1	2	3	4	5
0	country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
1	Afghanistan	0	0	0	0.0	Asia
2	Albania	89	132	54	4.9	Europe
3	Algeria	25	0	14	0.7	Africa
4	Andorra	245	138	312	12.4	Europe

drinks = pd.read_csv("data/drinks.csv")
drinks.head()

	country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
0	Afghanistan	0	0	0	0.0	Asia
1	Albania	89	132	54	4.9	Europe
2	Algeria	25	0	14	0.7	Africa
3	Andorra	245	138	312	12.4	Europe
4	Angola	217	57	45	5.9	Africa

# every DataFrame has an index (sometimes called the "row labels")
drinks.index

RangeIndex(start=0, stop=193, step=1)

# set an existing column as the index
drinks = drinks.set_index("country")
drinks.head()
# 'country' is now the index
drinks.index

Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
       ...
       'Tanzania', 'USA', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela',
       'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],
      dtype='object', name='country', length=193)

# restore the index name, and move the index back to a column
drinks.index.name = "country"
drinks = drinks.reset_index()
drinks.head()

	country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
0	Afghanistan	0	0	0	0.0	Asia
1	Albania	89	132	54	4.9	Europe
2	Algeria	25	0	14	0.7	Africa
3	Andorra	245	138	312	12.4	Europe
4	Angola	217	57	45	5.9	Africa

Select a column#

drinks = pd.read_csv("data/drinks.csv")
drinks.head()

	country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
0	Afghanistan	0	0	0	0.0	Asia
1	Albania	89	132	54	4.9	Europe
2	Algeria	25	0	14	0.7	Africa
3	Andorra	245	138	312	12.4	Europe
4	Angola	217	57	45	5.9	Africa

# select the 'City' Series using bracket notation
drinks["country"]
# or equivalently, use dot notation
drinks.country

    Afghanistan
        Albania
        Algeria
        Andorra
         Angola
          ...     
    Venezuela
      Vietnam
        Yemen
       Zambia
     Zimbabwe
Name: country, Length: 193, dtype: object

drink = "wine"
drinks[f"{drink}_servings"]

      0
     54
     14
    312
     45
      ... 
    3
    1
    0
    4
    4
Name: wine_servings, Length: 193, dtype: int64

Select numeric columns#

drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

import numpy as np

drinks.select_dtypes(include=[np.number]).dtypes

beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
dtype: object

drinks.select_dtypes(include=[np.number]).squeeze("columns")

	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol
0	0	0	0	0.0
1	89	132	54	4.9
2	25	0	14	0.7
3	245	138	312	12.4
4	217	57	45	5.9
...	...	...	...	...
188	333	100	3	7.7
189	111	2	1	2.0
190	6	0	0	0.1
191	32	19	4	2.5
192	64	18	4	4.7

193 rows × 4 columns

Select rows and columns#

# row 0, all columns
drinks.loc[0, :]

country                         Afghanistan
beer_servings                             0
spirit_servings                           0
wine_servings                             0
total_litres_of_pure_alcohol            0.0
continent                              Asia
Name: 0, dtype: object

# rows 0 and 1 and 2, all columns
drinks.loc[[0, 1, 2], :]

	country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
0	Afghanistan	0	0	0	0.0	Asia
1	Albania	89	132	54	4.9	Europe
2	Algeria	25	0	14	0.7	Africa

# rows 0 through 2 (inclusive), all columns
drinks.loc[0:2, :]

	country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
0	Afghanistan	0	0	0	0.0	Asia
1	Albania	89	132	54	4.9	Europe
2	Algeria	25	0	14	0.7	Africa

# rows 0 through 2 (inclusive), column 'City'
drinks.loc[0:2, "country"]

  Afghanistan
      Albania
      Algeria
Name: country, dtype: object

# rows 0 through 2 (inclusive), columns 'City' and 'State'
drinks.loc[0:2, ["country", "continent"]]

	country	continent
0	Afghanistan	Asia
1	Albania	Europe
2	Algeria	Africa

# rows 0 through 2 (inclusive), columns 'City' through 'State' (inclusive)
drinks.loc[0:2, "country":"continent"]

	country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
0	Afghanistan	0	0	0	0.0	Asia
1	Albania	89	132	54	4.9	Europe
2	Algeria	25	0	14	0.7	Africa

# rows in positions 0 and 1, columns in positions 0 and 3
drinks.iloc[[0, 1], [0, 3]]

	country	wine_servings
0	Afghanistan	0
1	Albania	54

# rows in positions 0 through 2 (exclusive), columns in positions 0 through 4 (exclusive)
drinks.iloc[0:2, 0:4]

	country	beer_servings	spirit_servings	wine_servings
0	Afghanistan	0	0	0
1	Albania	89	132	54

# rows in positions 0 through 2 (exclusive), all columns
drinks.iloc[0:2, :]

	country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
0	Afghanistan	0	0	0	0.0	Asia
1	Albania	89	132	54	4.9	Europe

Filter#

Column filter#

drinks.filter(items=["beer_servings", "continent"])

	beer_servings	continent
0	0	Asia
1	89	Europe
2	25	Africa
3	245	Europe
4	217	Africa
...	...	...
188	333	South America
189	111	Asia
190	6	Asia
191	32	Africa
192	64	Africa

193 rows × 2 columns

drinks.filter(regex="servings", axis=1)

	beer_servings	spirit_servings	wine_servings
0	0	0	0
1	89	132	54
2	25	0	14
3	245	138	312
4	217	57	45
...	...	...	...
188	333	100	3
189	111	2	1
190	6	0	0
191	32	19	4
192	64	18	4

193 rows × 3 columns

Index filter#

drinks.filter(like="2", axis=0).head()

	country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
2	Algeria	25	0	14	0.7	Africa
12	Bahrain	42	63	7	2.0	Asia
20	Bolivia	167	41	8	3.8	South America
21	Bosnia-Herzegovina	76	173	8	4.6	Europe
22	Botswana	173	35	35	5.4	Africa

Value filter#

drinks.loc[drinks.continent == "South America"]

	country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
6	Argentina	193	25	221	8.3	South America
20	Bolivia	167	41	8	3.8	South America
23	Brazil	245	145	16	7.2	South America
35	Chile	130	124	172	7.6	South America
37	Colombia	159	76	3	4.2	South America
52	Ecuador	162	74	3	4.2	South America
72	Guyana	93	302	1	7.1	South America
132	Paraguay	213	117	74	7.3	South America
133	Peru	163	160	21	6.1	South America
163	Suriname	128	178	7	5.6	South America
185	Uruguay	115	35	220	6.6	South America
188	Venezuela	333	100	3	7.7	South America

drinks.loc[drinks.continent == "South America", "wine_servings"]

    221
     8
    16
   172
     3
     3
     1
   74
   21
    7
  220
    3
Name: wine_servings, dtype: int64

drinks.query("continent=='South America'")["wine_servings"]

    221
     8
    16
   172
     3
     3
     1
   74
   21
    7
  220
    3
Name: wine_servings, dtype: int64

movies = pd.read_csv("data/imdb_1000.csv")
movies.head()

	star_rating	title	content_rating	genre	duration	actors_list
0	9.3	The Shawshank Redemption	R	Crime	142	[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...
1	9.2	The Godfather	R	Crime	175	[u'Marlon Brando', u'Al Pacino', u'James Caan']
2	9.1	The Godfather: Part II	R	Crime	200	[u'Al Pacino', u'Robert De Niro', u'Robert Duv...
3	9.0	The Dark Knight	PG-13	Action	152	[u'Christian Bale', u'Heath Ledger', u'Aaron E...
4	8.9	Pulp Fiction	R	Crime	154	[u'John Travolta', u'Uma Thurman', u'Samuel L....

# use the '&' operator to specify that both conditions are required
movies.query("duration >=200 & genre == 'Drama'")

	star_rating	title	content_rating	genre	duration	actors_list
17	8.7	Seven Samurai	UNRATED	Drama	207	[u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K...
157	8.2	Gone with the Wind	G	Drama	238	[u'Clark Gable', u'Vivien Leigh', u'Thomas Mit...
476	7.8	Hamlet	PG-13	Drama	242	[u'Kenneth Branagh', u'Julie Christie', u'Dere...

# using the '|' operator would have shown movies that are either long or dramas (or both)
movies.query("duration >=200 | genre == 'Drama'").head()

	star_rating	title	content_rating	genre	duration	actors_list
2	9.1	The Godfather: Part II	R	Crime	200	[u'Al Pacino', u'Robert De Niro', u'Robert Duv...
5	8.9	12 Angry Men	NOT RATED	Drama	96	[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals...
7	8.9	The Lord of the Rings: The Return of the King	PG-13	Adventure	201	[u'Elijah Wood', u'Viggo Mortensen', u'Ian McK...
9	8.9	Fight Club	R	Drama	139	[u'Brad Pitt', u'Edward Norton', u'Helena Bonh...
13	8.8	Forrest Gump	PG-13	Drama	142	[u'Tom Hanks', u'Robin Wright', u'Gary Sinise']

Range filter#

movies.loc[movies["genre"].isin(["Crime", "Drama", "Action"])].head()

	star_rating	title	content_rating	genre	duration	actors_list
0	9.3	The Shawshank Redemption	R	Crime	142	[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...
1	9.2	The Godfather	R	Crime	175	[u'Marlon Brando', u'Al Pacino', u'James Caan']
2	9.1	The Godfather: Part II	R	Crime	200	[u'Al Pacino', u'Robert De Niro', u'Robert Duv...
3	9.0	The Dark Knight	PG-13	Action	152	[u'Christian Bale', u'Heath Ledger', u'Aaron E...
4	8.9	Pulp Fiction	R	Crime	154	[u'John Travolta', u'Uma Thurman', u'Samuel L....

movies.loc[movies["duration"].between(100, 200, inclusive="neither")].head()

	star_rating	title	content_rating	genre	duration	actors_list
0	9.3	The Shawshank Redemption	R	Crime	142	[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...
1	9.2	The Godfather	R	Crime	175	[u'Marlon Brando', u'Al Pacino', u'James Caan']
3	9.0	The Dark Knight	PG-13	Action	152	[u'Christian Bale', u'Heath Ledger', u'Aaron E...
4	8.9	Pulp Fiction	R	Crime	154	[u'John Travolta', u'Uma Thurman', u'Samuel L....
6	8.9	The Good, the Bad and the Ugly	NOT RATED	Western	161	[u'Clint Eastwood', u'Eli Wallach', u'Lee Van ...

Reshape#

drinks = pd.read_csv("data/drinks.csv")
drinks.head()

	country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
0	Afghanistan	0	0	0	0.0	Asia
1	Albania	89	132	54	4.9	Europe
2	Algeria	25	0	14	0.7	Africa
3	Andorra	245	138	312	12.4	Europe
4	Angola	217	57	45	5.9	Africa

# apply the 'max' function along axis 0 to calculate the maximum value in each column
drinks.loc[:, "beer_servings":"wine_servings"].apply(max, axis=0)

beer_servings      376
spirit_servings    438
wine_servings      370
dtype: int64

# convert every DataFrame element into a float
drinks.loc[:, "beer_servings":"wine_servings"].applymap(float).head()

/tmp/ipykernel_1822/1461345987.py:2: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
  drinks.loc[:, "beer_servings":"wine_servings"].applymap(float).head()

	beer_servings	spirit_servings	wine_servings
0	0.0	0.0	0.0
1	89.0	132.0	54.0
2	25.0	0.0	14.0
3	245.0	138.0	312.0
4	217.0	57.0	45.0

# overwrite the existing DataFrame columns
drinks.loc[:, "beer_servings":"wine_servings"] = drinks.loc[
    :, "beer_servings":"wine_servings"
].applymap(float)
drinks.head()

/tmp/ipykernel_1822/3081965367.py:4: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
  ].applymap(float)

	country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
0	Afghanistan	0	0	0	0.0	Asia
1	Albania	89	132	54	4.9	Europe
2	Algeria	25	0	14	0.7	Africa
3	Andorra	245	138	312	12.4	Europe
4	Angola	217	57	45	5.9	Africa

Concat#

df1 = pd.DataFrame(
    {
        "name": ["A", "B", "C", "D"],
        "math": [60, 89, 82, 70],
        "physics": [66, 95, 83, 66],
        "chemistry": [61, 91, 77, 70],
    }
)

df2 = pd.DataFrame(
    {
        "name": ["E", "F", "G", "H"],
        "math": [66, 95, 83, 66],
        "physics": [60, 89, 82, 70],
        "chemistry": [90, 81, 78, 90],
    }
)

pd.concat([df1, df2])

	name	math	physics	chemistry
0	A	60	66	61
1	B	89	95	91
2	C	82	83	77
3	D	70	66	70
0	E	66	60	90
1	F	95	89	81
2	G	83	82	78
3	H	66	70	90

pd.concat([df1, df2], ignore_index=True)

	name	math	physics	chemistry
0	A	60	66	61
1	B	89	95	91
2	C	82	83	77
3	D	70	66	70
4	E	66	60	90
5	F	95	89	81
6	G	83	82	78
7	H	66	70	90

pd.concat([df1, df2], axis=1)

	name	math	physics	chemistry	name	math	physics	chemistry
0	A	60	66	61	E	66	60	90
1	B	89	95	91	F	95	89	81
2	C	82	83	77	G	83	82	78
3	D	70	66	70	H	66	70	90

pd.concat([df1, df2], keys=["Year 1", "Year 2"], names=["Class", None]).reset_index(
    level=0
)

	Class	name	math	physics	chemistry
0	Year 1	A	60	66	61
1	Year 1	B	89	95	91
2	Year 1	C	82	83	77
3	Year 1	D	70	66	70
0	Year 2	E	66	60	90
1	Year 2	F	95	89	81
2	Year 2	G	83	82	78
3	Year 2	H	66	70	90

pd.concat([df1, df2], sort=True)

	chemistry	math	name	physics
0	61	60	A	66
1	91	89	B	95
2	77	82	C	83
3	70	70	D	66
0	90	66	E	60
1	81	95	F	89
2	78	83	G	82
3	90	66	H	70

Merge#

movie_cols = ["movie_id", "title"]
movies = pd.read_csv(
    "data/u.item", sep="|", header=None, names=movie_cols, usecols=[0, 1]
)
movies.head()

	movie_id	title
0	1	Toy Story (1995)
1	2	GoldenEye (1995)
2	3	Four Rooms (1995)
3	4	Get Shorty (1995)
4	5	Copycat (1995)

movies.shape

(1682, 2)

rating_cols = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv("data/u.data", sep="\t", header=None, names=rating_cols)
ratings.head()

	user_id	movie_id	rating	timestamp
0	196	242	3	881250949
1	186	302	3	891717742
2	22	377	1	878887116
3	244	51	2	880606923
4	166	346	1	886397596

ratings.shape

(100000, 4)

ratings.query("movie_id == 1").head()

	user_id	movie_id	rating	timestamp
24	308	1	4	887736532
454	287	1	5	875334088
957	148	1	4	877019411
971	280	1	4	891700426
1324	66	1	3	883601324

movie_ratings = movies.merge(ratings)
movie_ratings.columns

Index(['movie_id', 'title', 'user_id', 'rating', 'timestamp'], dtype='object')

movie_ratings.head()

	movie_id	title	user_id	rating	timestamp
0	1	Toy Story (1995)	308	4	887736532
1	1	Toy Story (1995)	287	5	875334088
2	1	Toy Story (1995)	148	4	877019411
3	1	Toy Story (1995)	280	4	891700426
4	1	Toy Story (1995)	66	3	883601324

print(movies.shape)
print(ratings.shape)
print(movie_ratings.shape)

(1682, 2)
(100000, 4)
(100000, 5)

movies.columns = ["m_id", "title"]
movies.columns

Index(['m_id', 'title'], dtype='object')

ratings.columns

Index(['user_id', 'movie_id', 'rating', 'timestamp'], dtype='object')

movies.merge(ratings, left_on="m_id", right_on="movie_id").head()

	m_id	title	user_id	movie_id	rating	timestamp
0	1	Toy Story (1995)	308	1	4	887736532
1	1	Toy Story (1995)	287	1	5	875334088
2	1	Toy Story (1995)	148	1	4	877019411
3	1	Toy Story (1995)	280	1	4	891700426
4	1	Toy Story (1995)	66	1	3	883601324

movies.merge(ratings, left_index=True, right_on="movie_id").head()

	m_id	title	user_id	movie_id	rating	timestamp
24	2	GoldenEye (1995)	308	1	4	887736532
454	2	GoldenEye (1995)	287	1	5	875334088
957	2	GoldenEye (1995)	148	1	4	877019411
971	2	GoldenEye (1995)	280	1	4	891700426
1324	2	GoldenEye (1995)	66	1	3	883601324

movies.merge(ratings, left_index=True, right_index=True).head()

	m_id	title	user_id	movie_id	rating	timestamp
0	1	Toy Story (1995)	196	242	3	881250949
1	2	GoldenEye (1995)	186	302	3	891717742
2	3	Four Rooms (1995)	22	377	1	878887116
3	4	Get Shorty (1995)	244	51	2	880606923
4	5	Copycat (1995)	166	346	1	886397596

Merge with MultiIndexes#

close = pd.read_csv(
    "data/stocks.csv", usecols=[0, 1, 3], index_col=["Symbol", "Date"]
).sort_index()
close

		Close
Symbol	Date
AAPL	2016-10-03	112.52
	2016-10-04	113.00
	2016-10-05	113.05
CSCO	2016-10-03	31.50
	2016-10-04	31.35
	2016-10-05	31.59
MSFT	2016-10-03	57.42
	2016-10-04	57.24
	2016-10-05	57.64

volume = pd.read_csv(
    "data/stocks.csv", usecols=[0, 2, 3], index_col=["Symbol", "Date"]
).sort_index()
volume

		Volume
Symbol	Date
AAPL	2016-10-03	21701800
	2016-10-04	29736800
	2016-10-05	21453100
CSCO	2016-10-03	14070500
	2016-10-04	18460400
	2016-10-05	11808600
MSFT	2016-10-03	19189500
	2016-10-04	20085900
	2016-10-05	16726400

both = close.merge(volume, left_index=True, right_index=True)
both

		Close	Volume
Symbol	Date
AAPL	2016-10-03	112.52	21701800
	2016-10-04	113.00	29736800
	2016-10-05	113.05	21453100
CSCO	2016-10-03	31.50	14070500
	2016-10-04	31.35	18460400
	2016-10-05	31.59	11808600
MSFT	2016-10-03	57.42	19189500
	2016-10-04	57.24	20085900
	2016-10-05	57.64	16726400

both.reset_index()

	Symbol	Date	Close	Volume
0	AAPL	2016-10-03	112.52	21701800
1	AAPL	2016-10-04	113.00	29736800
2	AAPL	2016-10-05	113.05	21453100
3	CSCO	2016-10-03	31.50	14070500
4	CSCO	2016-10-04	31.35	18460400
5	CSCO	2016-10-05	31.59	11808600
6	MSFT	2016-10-03	57.42	19189500
7	MSFT	2016-10-04	57.24	20085900
8	MSFT	2016-10-05	57.64	16726400

Join#

A = pd.DataFrame({"color": ["green", "yellow", "red"], "num": [1, 2, 3]})
A

	color	num
0	green	1
1	yellow	2
2	red	3

B = pd.DataFrame({"color": ["green", "yellow", "pink"], "size": ["S", "M", "L"]})
B

	color	size
0	green	S
1	yellow	M
2	pink	L

Inner join: Only include observations found in both A and B:

A.merge(B, how="inner")

	color	num	size
0	green	1	S
1	yellow	2	M

Outer join: Include observations found in either A or B:

A.merge(B, how="outer")

	color	num	size
0	green	1.0	S
1	pink	NaN	L
2	red	3.0	NaN
3	yellow	2.0	M

Left join: Include all observations found in A:

A.merge(B, how="left")

	color	num	size
0	green	1	S
1	yellow	2	M
2	red	3	NaN

Right join: Include all observations found in B:

A.merge(B, how="right")

	color	num	size
0	green	1.0	S
1	yellow	2.0	M
2	pink	NaN	L

Explode#

orders = pd.read_csv("data/chipotle.tsv", sep="\t")
orders.head()

	order_id	quantity	item_name	choice_description	item_price
0	1	1	Chips and Fresh Tomato Salsa	NaN	$2.39
1	1	1	Izze	[Clementine]	$3.39
2	1	1	Nantucket Nectar	[Apple]	$3.39
3	1	1	Chips and Tomatillo-Green Chili Salsa	NaN	$2.39
4	2	2	Chicken Bowl	[Tomatillo-Red Chili Salsa (Hot), [Black Beans...	$16.98

orders["item_name"] = orders["item_name"].str.split()
orders.head()

	order_id	quantity	item_name	choice_description	item_price
0	1	1	[Chips, and, Fresh, Tomato, Salsa]	NaN	$2.39
1	1	1	[Izze]	[Clementine]	$3.39
2	1	1	[Nantucket, Nectar]	[Apple]	$3.39
3	1	1	[Chips, and, Tomatillo-Green, Chili, Salsa]	NaN	$2.39
4	2	2	[Chicken, Bowl]	[Tomatillo-Red Chili Salsa (Hot), [Black Beans...	$16.98

orders.explode("item_name")

	order_id	quantity	item_name	choice_description	item_price
0	1	1	Chips	NaN	$2.39
0	1	1	and	NaN	$2.39
0	1	1	Fresh	NaN	$2.39
0	1	1	Tomato	NaN	$2.39
0	1	1	Salsa	NaN	$2.39
...	...	...	...	...	...
4620	1834	1	Salad	[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...	$8.75
4620	1834	1	Bowl	[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...	$8.75
4621	1834	1	Chicken	[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...	$8.75
4621	1834	1	Salad	[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...	$8.75
4621	1834	1	Bowl	[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...	$8.75

11501 rows × 5 columns

Display#

movies = pd.read_csv("data/imdb_1000.csv")
movies.head()

	star_rating	title	content_rating	genre	duration	actors_list
0	9.3	The Shawshank Redemption	R	Crime	142	[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...
1	9.2	The Godfather	R	Crime	175	[u'Marlon Brando', u'Al Pacino', u'James Caan']
2	9.1	The Godfather: Part II	R	Crime	200	[u'Al Pacino', u'Robert De Niro', u'Robert Duv...
3	9.0	The Dark Knight	PG-13	Action	152	[u'Christian Bale', u'Heath Ledger', u'Aaron E...
4	8.9	Pulp Fiction	R	Crime	154	[u'John Travolta', u'Uma Thurman', u'Samuel L....

movies[:10].style.bar("duration", vmin=60)

	star_rating	title	content_rating	genre	duration	actors_list
0	9.300000	The Shawshank Redemption	R	Crime	142	[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunton']
1	9.200000	The Godfather	R	Crime	175	[u'Marlon Brando', u'Al Pacino', u'James Caan']
2	9.100000	The Godfather: Part II	R	Crime	200	[u'Al Pacino', u'Robert De Niro', u'Robert Duvall']
3	9.000000	The Dark Knight	PG-13	Action	152	[u'Christian Bale', u'Heath Ledger', u'Aaron Eckhart']
4	8.900000	Pulp Fiction	R	Crime	154	[u'John Travolta', u'Uma Thurman', u'Samuel L. Jackson']
5	8.900000	12 Angry Men	NOT RATED	Drama	96	[u'Henry Fonda', u'Lee J. Cobb', u'Martin Balsam']
6	8.900000	The Good, the Bad and the Ugly	NOT RATED	Western	161	[u'Clint Eastwood', u'Eli Wallach', u'Lee Van Cleef']
7	8.900000	The Lord of the Rings: The Return of the King	PG-13	Adventure	201	[u'Elijah Wood', u'Viggo Mortensen', u'Ian McKellen']
8	8.900000	Schindler's List	R	Biography	195	[u'Liam Neeson', u'Ralph Fiennes', u'Ben Kingsley']
9	8.900000	Fight Club	R	Drama	139	[u'Brad Pitt', u'Edward Norton', u'Helena Bonham Carter']

movies[:10].style.background_gradient("Greens", subset="star_rating")

	star_rating	title	content_rating	genre	duration	actors_list
0	9.300000	The Shawshank Redemption	R	Crime	142	[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunton']
1	9.200000	The Godfather	R	Crime	175	[u'Marlon Brando', u'Al Pacino', u'James Caan']
2	9.100000	The Godfather: Part II	R	Crime	200	[u'Al Pacino', u'Robert De Niro', u'Robert Duvall']
3	9.000000	The Dark Knight	PG-13	Action	152	[u'Christian Bale', u'Heath Ledger', u'Aaron Eckhart']
4	8.900000	Pulp Fiction	R	Crime	154	[u'John Travolta', u'Uma Thurman', u'Samuel L. Jackson']
5	8.900000	12 Angry Men	NOT RATED	Drama	96	[u'Henry Fonda', u'Lee J. Cobb', u'Martin Balsam']
6	8.900000	The Good, the Bad and the Ugly	NOT RATED	Western	161	[u'Clint Eastwood', u'Eli Wallach', u'Lee Van Cleef']
7	8.900000	The Lord of the Rings: The Return of the King	PG-13	Adventure	201	[u'Elijah Wood', u'Viggo Mortensen', u'Ian McKellen']
8	8.900000	Schindler's List	R	Biography	195	[u'Liam Neeson', u'Ralph Fiennes', u'Ben Kingsley']
9	8.900000	Fight Club	R	Drama	139	[u'Brad Pitt', u'Edward Norton', u'Helena Bonham Carter']

(
    movies[:10]
    .style.bar("duration", vmin=60)
    .background_gradient("Greens", subset="star_rating")
    .highlight_null()
    .highlight_max()
)

	star_rating	title	content_rating	genre	duration	actors_list
0	9.300000	The Shawshank Redemption	R	Crime	142	[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunton']
1	9.200000	The Godfather	R	Crime	175	[u'Marlon Brando', u'Al Pacino', u'James Caan']
2	9.100000	The Godfather: Part II	R	Crime	200	[u'Al Pacino', u'Robert De Niro', u'Robert Duvall']
3	9.000000	The Dark Knight	PG-13	Action	152	[u'Christian Bale', u'Heath Ledger', u'Aaron Eckhart']
4	8.900000	Pulp Fiction	R	Crime	154	[u'John Travolta', u'Uma Thurman', u'Samuel L. Jackson']
5	8.900000	12 Angry Men	NOT RATED	Drama	96	[u'Henry Fonda', u'Lee J. Cobb', u'Martin Balsam']
6	8.900000	The Good, the Bad and the Ugly	NOT RATED	Western	161	[u'Clint Eastwood', u'Eli Wallach', u'Lee Van Cleef']
7	8.900000	The Lord of the Rings: The Return of the King	PG-13	Adventure	201	[u'Elijah Wood', u'Viggo Mortensen', u'Ian McKellen']
8	8.900000	Schindler's List	R	Biography	195	[u'Liam Neeson', u'Ralph Fiennes', u'Ben Kingsley']
9	8.900000	Fight Club	R	Drama	139	[u'Brad Pitt', u'Edward Norton', u'Helena Bonham Carter']

	beer_servings	spirit_servings	wine_servings
0	0	0	0
1	89	132	54
2	25	0	14
3	245	138	312
4	217	57	45
...	...	...	...
188	333	100	3
189	111	2	1
190	6	0	0
191	32	19	4
192	64	18	4

	name	math	physics	chemistry
0	A	60	66	61
1	B	89	95	91
2	C	82	83	77
3	D	70	66	70
0	E	66	60	90
1	F	95	89	81
2	G	83	82	78
3	H	66	70	90

	name	math	physics	chemistry
0	A	60	66	61
1	B	89	95	91
2	C	82	83	77
3	D	70	66	70
4	E	66	60	90
5	F	95	89	81
6	G	83	82	78
7	H	66	70	90

	name	math	physics	chemistry	name	math	physics	chemistry
0	A	60	66	61	E	66	60	90
1	B	89	95	91	F	95	89	81
2	C	82	83	77	G	83	82	78
3	D	70	66	70	H	66	70	90

	chemistry	math	name	physics
0	61	60	A	66
1	91	89	B	95
2	77	82	C	83
3	70	70	D	66
0	90	66	E	60
1	81	95	F	89
2	78	83	G	82
3	90	66	H	70

	beer_servings	spirit_servings	wine_servings
0	0	0	0
1	89	132	54
2	25	0	14
3	245	138	312
4	217	57	45
...	...	...	...
188	333	100	3
189	111	2	1
190	6	0	0
191	32	19	4
192	64	18	4

	name	math	physics	chemistry
0	A	60	66	61
1	B	89	95	91
2	C	82	83	77
3	D	70	66	70
0	E	66	60	90
1	F	95	89	81
2	G	83	82	78
3	H	66	70	90

	name	math	physics	chemistry
0	A	60	66	61
1	B	89	95	91
2	C	82	83	77
3	D	70	66	70
4	E	66	60	90
5	F	95	89	81
6	G	83	82	78
7	H	66	70	90

	name	math	physics	chemistry	name	math	physics	chemistry
0	A	60	66	61	E	66	60	90
1	B	89	95	91	F	95	89	81
2	C	82	83	77	G	83	82	78
3	D	70	66	70	H	66	70	90

	chemistry	math	name	physics
0	61	60	A	66
1	91	89	B	95
2	77	82	C	83
3	70	70	D	66
0	90	66	E	60
1	81	95	F	89
2	78	83	G	82
3	90	66	H	70