Get differences between two pandas DataFrames
# Import libraries
import pandas as pd
import numpy as np
# Build the reference frame: five French cities and their populations
df1 = pd.DataFrame(
    {
        'city': ['Paris', 'Lyon', 'Marseille', 'Lille', 'Strasbourg'],
        'population': [2148000, 513300, 861635, 232741, 277270],
    }
)
# Clone it, then overwrite a single cell (row 1, column 1 -> Lyon's
# population) so that the two frames differ in exactly one row
df2 = df1.copy()
df2.iat[1, 1] = 0
df2
| | city | population |
---|---|---|
0 | Paris | 2148000 |
1 | Lyon | 0 |
2 | Marseille | 861635 |
3 | Lille | 232741 |
4 | Strasbourg | 277270 |
Check if DataFrames are identical
# Check if dataframes are identical
# `equals` yields a single boolean for the whole comparison; it is False
# here because one cell of df2 was overwritten after copying df1
df1.equals(df2)
False
Get delta rows between DataFrames
# Get the rows that differ between the dataframes: stack both frames and
# keep only the rows that have no identical twin anywhere in the stack
# (same effect as `drop_duplicates(keep=False)`)
# NOTE: a row repeated inside a single frame is filtered out as well
pd.concat([df1, df2]).loc[lambda stacked: ~stacked.duplicated(keep=False)]
| | city | population |
---|---|---|
1 | Lyon | 513300 |
1 | Lyon | 0 |
# Get delta rows with `merge()`: an outer join on all shared columns with
# `indicator=True` tags each row in `_merge` as left_only / right_only /
# both; keeping everything except "both" leaves only the differing rows
(
    pd.merge(df1, df2, how='outer', indicator=True)
    .loc[lambda merged: merged['_merge'] != "both"]
)
| | city | population | _merge |
---|---|---|---|
1 | Lyon | 513300 | left_only |
5 | Lyon | 0 | right_only |
Get common rows between DataFrames
# Get the rows present in both dataframes: an inner join on all shared
# columns keeps only the rows whose values match on both sides
pd.merge(df1, df2, how='inner')
| | city | population |
---|---|---|
0 | Paris | 2148000 |
1 | Marseille | 861635 |
2 | Lille | 232741 |
3 | Strasbourg | 277270 |