Pandas tips cool

Một số tricks cơ bản trong pandas

Import thư viện trước khi thực hiện code

import time
import threading
import concurrent.futures
import multiprocessing

Load dữ liệu

import pandas as pd
# Download the interviews dataset (a CSV hosted on GitHub) into a DataFrame,
# then report its (rows, columns) shape and preview the first three rows.
DATA_URL = (
    'https://raw.githubusercontent.com/dophuchao/dophuchao.github.io/'
    'master/data/interviews.csv'
)
df = pd.read_csv(DATA_URL)
print(df.shape)
df.head(3)

Thay đổi dữ liệu a_ij hoặc cột nào đó

# .loc addresses a single cell by (row label, column name) and can assign to it
df.loc[0, 'Review'] = 'Orange is love. Orange is life' # row label 0, column 'Review'
df.head(3)

# change the company name Apple to Orange
# df['Company'] == 'Apple' is a boolean mask: True for every row whose 'Company' is 'Apple'
df.loc[df['Company'] == 'Apple', 'Company'] = 'Orange'
df.head(3)

# Make pandas raise an error on chained assignment.
# NOTE(review): this option belongs to the pre-Copy-on-Write behaviour; confirm
# it is still available in the pandas version in use.
pd.set_option('mode.chained_assignment', 'raise')
# Example of a chained assignment that this setting is meant to reject
# (left disabled on purpose):
#df['Review'][0] = "I like Orange better"

Indexing and slicing

df.iloc[1] # the row at integer position 1 (i.e. the second row)

df.iloc[-6:] # the last 6 rows

df.iloc[-6::2] # the last 6 rows, taking every 2nd one

Select rows

# Select the rows whose 'Offer' column equals 'Declined offer'
df.loc[df['Offer'] == 'Declined offer']
# Equivalent form: build the boolean mask first, then index with it
mask = df['Offer'] == 'Declined offer'
df.loc[mask]

Create labels

# creating labels
# Companies labelled 'Hardware'; every other company is labelled 'Software'.
# Built once at import time instead of on every call.
_HARDWARE_COMPANIES = frozenset({'Orange', 'Dell', 'IBM', 'Siemens'})


def company_type(x):
    """Classify a row as 'Hardware' or 'Software' by its company name.

    Args:
        x: A row-like object supporting ``x['Company']``, e.g. the Series
            that ``DataFrame.apply(..., axis=1)`` passes for each row, or a
            plain dict.

    Returns:
        'Hardware' if ``x['Company']`` is one of the hardware companies,
        otherwise 'Software'.

    Raises:
        KeyError: If ``x`` has no 'Company' entry.
    """
    return 'Hardware' if x['Company'] in _HARDWARE_COMPANIES else 'Software'

# Label every row: axis=1 makes apply() call company_type once per row.
# The function is passed directly; wrapping it in a lambda adds nothing.
df['Type'] = df.apply(company_type, axis=1)
# Use the new 'Type' column as the index so rows can be looked up by label.
df = df.set_index('Type')

# All rows whose index label is 'Hardware'.
df.loc['Hardware']

Drop a label

# Replace the current index with the default integer index, modifying df itself.
# drop=True discards the old index values instead of adding them back as a column.
df.reset_index(drop=True, inplace=True)
df

String accessor

# The .str accessor applies string methods element-wise to a column.
# Lower-case every review:
df['Review'].str.lower()

# Length of every review:
df.Review.str.len()

Thêm cột dữ liệu sử dụng regex

# Add a 'Process' column by matching keywords in the review text (regex).
# The rules run from least to most specific, so a later match overwrites an
# earlier one; rows matching none of the patterns are left without a label.
# na=False treats a missing Review as "no match"; otherwise the mask can
# contain NaN, which .loc refuses to use as a boolean indexer.
df.loc[df['Review'].str.contains('days', na=False), 'Process'] = 'Short'
df.loc[df['Review'].str.contains('week', na=False), 'Process'] = 'Average'
# Raw string so that \d reaches the regex engine untouched (in a normal string
# literal '\d' is an invalid escape sequence).
# 'Long' = mentions "month", or a token starting with digits 4-9 before " week",
# or a token starting with a two-or-more-digit number before " weeks".
long_pattern = r'month|[4-9]+[^ ]* week|[1-9]\d{1,}[^ ]* weeks'
df.loc[df['Review'].str.contains(long_pattern, na=False), 'Process'] = 'Long'
# Widen the column display so long reviews are truncated later.
pd.set_option('display.max_colwidth', 100)
# Show only the rows that received a Process label.
df.loc[df['Process'].notna(), ['Review', 'Process']]

Một số hàm dựng sẵn của str

# Print the attribute names defined on the object behind `pd.Series.str`
# (the built-in string helpers; the listing also includes private names).
print(vars(pd.Series.str).keys())

Data exploration

df.tail(8) # the last 8 rows

Thống kê các thông tin về cột

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

show non-null count and type of all columns

# Print a summary of the frame: the non-null count and dtype of every column
df.info()

count unique values

# Number of distinct values in the 'Company' column
df.Company.nunique()

Count occurrences of each Company

# Number of rows per company, most frequent first
df.Company.value_counts()

Groupby and plot

# Count the rows for each (Company, Process) pair, pivot the 'Process' level
# (index level 1) into columns, then draw a bar chart with one group of bars
# per company.
pair_counts = df.groupby(['Company', 'Process']).size()
counts_by_process = pair_counts.unstack(level=1)
counts_by_process.plot(kind='bar', figsize=(15, 8))

Drop columns in place

# drop() returns a new frame by default; inplace=True modifies df itself instead
df.drop(columns=['Process'], inplace=True)
# Remaining column labels
df.columns

Link tham khảo

Full ipynb

Tài liệu tham khảo

Machine learning cơ bản

Hết.

Pandas tips cool!

Pandas tips cool

Load dữ liệu

Thay đổi dữ liệu a_ij hoặc cột nào đó

Indexing and slicing

Select rows

Create labels

Drop a label

String accessor

Thêm cột dữ liệu sử dụng regex

Một số hàm dựng sẵn của str

Data exploration

Thống kê các thông tin về cột

show non-null count and type of all columns

count unique values

Groupby and plot

change drop columns

Link tham khảo

Tài liệu tham khảo

CATALOG

FEATURED TAGS

LINKS