Cool pandas tips
Một số tricks cơ bản trong pandas
Import thư viện trước khi thực hiện code
1
2
3
4
| import time
import threading
import concurrent.futures
import multiprocessing
|
Load dữ liệu
1
2
3
4
| import pandas as pd
# Download the sample interview-review dataset straight from GitHub.
csv_url = 'https://raw.githubusercontent.com/dophuchao/dophuchao.github.io/master/data/interviews.csv'
df = pd.read_csv(csv_url)
print(df.shape)  # (number of rows, number of columns)
df.head(3)  # preview the first three rows
|
data:image/s3,"s3://crabby-images/bfd75/bfd7532b85e9e0d15b1308d7f7f5f7baeb5851c5" alt="img"
Thay đổi giá trị một ô a_ij (hàng i, cột j) hoặc cả một cột
1
2
3
4
5
6
7
8
| # use .loc (a DataFrame label-based indexer) to assign values
# Overwrite a single cell: row label 0, column 'Review'.
df.loc[0, 'Review'] = 'Orange is love. Orange is life'
df.head(3)
# Rename the company 'Apple' to 'Orange'.
# The boolean mask is True for every row whose 'Company' is 'Apple'.
is_apple = df['Company'] == 'Apple'
df.loc[is_apple, 'Company'] = 'Orange'
df.head(3)
|
data:image/s3,"s3://crabby-images/adf3c/adf3ce7f1b66fe4f9d59bde3791f312870ed79d6" alt="img"
1
2
3
| # make chained assignment raise an error instead of only warning
# With 'raise', pandas raises an error on chained assignment instead of
# emitting SettingWithCopyWarning.
# NOTE(review): this option may behave differently under Copy-on-Write in
# newer pandas releases — confirm against the installed version.
pd.set_option('mode.chained_assignment', 'raise')
# Example of a chained assignment that would then fail (left disabled on purpose):
#df['Review'][0] = "I like Orange better"
|
Indexing and slicing
data:image/s3,"s3://crabby-images/36f76/36f765adfcf9004aa6a157045b9d69ad616c56ec" alt="img"
1
| df.iloc[-6:] #6 hang cuoi cung
|
data:image/s3,"s3://crabby-images/757e7/757e72b59b0ede768e9bee7ec4b256b58dc6cf05" alt="img"
1
| df.iloc[-6::2] #6 hang cuoi cung, nhay buoc 2
|
data:image/s3,"s3://crabby-images/e8ecc/e8ecce24fd1c0ebec225dc95815b784cddb60187" alt="img"
Select rows
1
2
3
4
5
| #selecting rows
# Select the rows whose 'Offer' value is 'Declined offer'.
df.loc[df['Offer'] == 'Declined offer']
# Equivalent form: build and name the boolean mask first, then index with it.
declined = df['Offer'].eq('Declined offer')
df.loc[declined]
|
data:image/s3,"s3://crabby-images/28dad/28dad97a41d62d4a0111cda5c63fe70ead44d9b3" alt="img"
Create labels
1
2
3
4
5
6
7
8
9
| # creating labels
def company_type(x):
    """Classify a row as 'Hardware' or 'Software' by its company name.

    Args:
        x: A mapping-like row (for example a DataFrame row passed by
            ``df.apply(..., axis=1)``) that supports ``x['Company']``.

    Returns:
        'Hardware' if ``x['Company']`` is one of the listed hardware
        companies, otherwise 'Software'.
    """
    # 'Orange' is listed instead of 'Apple' because the company was renamed
    # earlier in this walkthrough.
    hardware_companies = {'Orange', 'Dell', 'IBM', 'Siemens'}
    return 'Hardware' if x['Company'] in hardware_companies else 'Software'
# Label every row: axis=1 hands each row to company_type.
df['Type'] = df.apply(company_type, axis=1)
# Use the new label as the index so rows can be looked up by type.
df = df.set_index('Type')
df.loc['Hardware']
|
data:image/s3,"s3://crabby-images/bea84/bea84edcf28282ab930232fa3b0c3b9f01a88dde" alt="img"
Drop a label
1
2
3
| # drop the label index
# drop=True discards the current index (the 'Type' labels) instead of
# moving it back into a column; inplace=True modifies df directly.
df.reset_index(drop=True, inplace=True)
df
|
data:image/s3,"s3://crabby-images/1331c/1331c7eca9d6281577f965a3802c11e3fd89094f" alt="img"
String accessor
1
2
| #string accessor
df['Review'].str.lower()
|
data:image/s3,"s3://crabby-images/ec865/ec865b6274ef383e5db923bf503fc7143241e19f" alt="img"
data:image/s3,"s3://crabby-images/f1ac1/f1ac19b7157d1175464f1183b99314829a1a3983" alt="img"
Thêm cột dữ liệu sử dụng regex
1
2
3
4
5
6
7
| # add a 'Process' column based on string checks (using regex)
# Classify the interview process length from keywords in the review text.
# Later assignments overwrite earlier ones, so the order matters:
# 'days' -> Short, then any 'week' -> Average, then the month / long-week
# patterns -> Long.
# na=False treats a missing review as "no match", so the boolean mask never
# contains NaN (which .loc rejects).
df.loc[df['Review'].str.contains('days', na=False), 'Process'] = 'Short'
df.loc[df['Review'].str.contains('week', na=False), 'Process'] = 'Average'
# Raw string so that \d reaches the regex engine without relying on an
# invalid string escape.
long_pattern = r'month|[4-9]+[^ ]* week|[1-9]\d{1,}[^ ]* weeks'
df.loc[df['Review'].str.contains(long_pattern, na=False), 'Process'] = 'Long'
# Widen the column display so the review text is readable.
pd.set_option('display.max_colwidth', 100)
# Show only the rows that received a label.
df.loc[df['Process'].notna(), ['Review', 'Process']]
|
data:image/s3,"s3://crabby-images/fcbdc/fcbdce8fae8e037b90248f4ad8714b1cd5ac6cbf" alt="img"
Một số hàm dựng sẵn của str
1
2
| # some built-in methods of the .str accessor
# Accessed on the class, pd.Series.str is the accessor class itself, so its
# __dict__ lists the attribute names defined directly on it (this includes
# private and dunder names, not only the public string methods).
# NOTE(review): relies on pandas accessor internals — confirm on the installed version.
print(pd.Series.str.__dict__.keys())
|
data:image/s3,"s3://crabby-images/6ed0f/6ed0fb5733d5c49e1626b89a43bbf358f7acfc3a" alt="img"
Data exploration
1
| df.tail(8) # lay 8 cai cuoi
|
data:image/s3,"s3://crabby-images/ceb30/ceb301c0a3c38433c75fc6bd8c03833ff0463280" alt="img"
Thống kê các thông tin về cột
1
2
| # generate statistics about numeric columns
df.describe()
|
data:image/s3,"s3://crabby-images/8c39a/8c39a943d95b2d4d49c83a0b673d17e14a519877" alt="img"
show non-null count and type of all columns
1
2
| #show non-null count and type of all columns
df.info()
|
data:image/s3,"s3://crabby-images/1f1d8/1f1d828a5c2f71b2de7d7e2569511d94fbf896ad" alt="img"
count unique values
1
2
| #count unique values
df.Company.nunique()
|
count of Company
1
2
| #count of Company
df.Company.value_counts()
|
data:image/s3,"s3://crabby-images/28e22/28e222bb156c86f848502b8df03fdf528eccdebb" alt="img"
Groupby and plot
1
2
3
4
| # groupby the df by 'Company' and 'Process', count the number of elements
# Count rows per ('Company', 'Process') pair.
pair_counts = df.groupby(['Company', 'Process']).size()
# Unstack the 'Process' level (index level 1) into columns.
counts_by_process = pair_counts.unstack(level=1)
# Plot a bar chart.
counts_by_process.plot(kind='bar', figsize=(15, 8))
|
data:image/s3,"s3://crabby-images/5f5a6/5f5a63f7b65b2659e8ef43a24d52983abd700528" alt="img"
Drop columns in place
1
2
3
| #to make changes to df, set 'inplace = True'
# Remove the 'Process' column from df itself (with inplace=True the call
# returns None and df is modified directly).
# Re-running this cell raises KeyError because the column no longer exists.
df.drop(columns=['Process'], inplace=True)
# List the remaining column names.
df.columns
|
data:image/s3,"s3://crabby-images/c40d2/c40d2135cb276a235a984a297447ae0b200204e2" alt="img"
Link tham khảo
Full ipynb
Tài liệu tham khảo
Machine learning cơ bản
Hết.