1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
def get_dtypes(data, drop_col=None):
    """Split the column names of a DataFrame into string-like and numeric lists.

    A column counts as numeric when its dtype equals one of int64, int32,
    float64 or float32; every other column is reported as string-like.

    Args:
        data: DataFrame whose columns are classified.
        drop_col: Optional iterable of column names to leave out of every
            returned list. ``None`` (the default) drops nothing.

    Returns:
        A tuple ``(str_var_list, num_var_list, all_var_list)``. Each list keeps
        the DataFrame's column order; ``all_var_list`` is the string-like
        names followed by the numeric names.
    """
    # None instead of a mutable ``[]`` default so no list is shared across calls.
    excluded = [] if drop_col is None else list(drop_col)
    # np.double is an alias of np.float64; kept to mirror the accepted dtypes.
    numeric_types = (np.int64, np.int32, np.float64, np.float32, np.double)

    str_var_list = []
    num_var_list = []
    for var in data.columns:
        if var in excluded:
            continue
        if data[var].dtypes in numeric_types:
            num_var_list.append(var)
        else:
            str_var_list.append(var)

    all_var_list = str_var_list + num_var_list
    return str_var_list, num_var_list, all_var_list
def describe(data, output_path=None):
    """Return ``data.describe(include='all')`` and optionally save it as CSV.

    Args:
        data: Object exposing ``describe(include=...)`` (e.g. a DataFrame).
        output_path: Directory in which ``describe.csv`` is written. When
            ``None`` nothing is written.

    Returns:
        The summary table produced by ``data.describe(include='all')``.
    """
    summary = data.describe(include='all')
    if output_path is None:
        return summary

    csv_path = os.path.join(output_path, 'describe.csv')
    summary.to_csv(csv_path)
    print('result saved at: ' + str(csv_path))
    return summary
def discrete_var_barplot(x, y, data, output_path=None):
    """Draw a bar plot of the discrete variable ``x`` against the target ``y``.

    With seaborn's default settings the bars show the mean value of ``y``.

    Args:
        x: Variable for the x axis, forwarded to ``sns.barplot``.
        y: Variable for the y axis, forwarded to ``sns.barplot``.
        data: Data set forwarded to ``sns.barplot``.
        output_path: Directory in which ``Barplot_<x>_<y>.png`` is saved.
            When ``None`` the figure is not saved.
    """
    plt.figure(figsize=(15, 10))
    sns.barplot(x=x, y=y, data=data)
    if output_path is None:
        return

    file_name = 'Barplot_' + str(x) + '_' + str(y) + '.png'
    image_path = os.path.join(output_path, file_name)
    plt.savefig(image_path)
    print('image saved at', str(image_path))
def discrete_var_countplot(x, data, output_path=None):
    """Draw a count plot showing how often each value of ``x`` occurs.

    Args:
        x: Variable to count, forwarded to ``sns.countplot``.
        data: Data set forwarded to ``sns.countplot``.
        output_path: Directory in which ``Countplot_<x>.png`` is saved.
            When ``None`` the figure is not saved.
    """
    plt.figure(figsize=(15, 10))
    sns.countplot(x=x, data=data)
    if output_path is None:
        return

    file_name = 'Countplot_' + str(x) + '.png'
    image_path = os.path.join(output_path, file_name)
    plt.savefig(image_path)
    print('Image saved at', str(image_path))
def discrete_var_boxplot(x, y, data, output_path=None):
    """Draw a box plot of the target ``y`` for the discrete variable ``x``.

    Args:
        x: Variable for the x axis, forwarded to ``sns.boxplot``.
        y: Variable for the y axis, forwarded to ``sns.boxplot``.
        data: Data set forwarded to ``sns.boxplot``.
        output_path: Directory in which ``Boxplot_<x>_<y>.png`` is saved.
            When ``None`` the figure is not saved.
    """
    plt.figure(figsize=(15, 10))
    sns.boxplot(x=x, y=y, data=data)
    if output_path is None:
        return

    file_name = 'Boxplot_' + str(x) + '_' + str(y) + '.png'
    image_path = os.path.join(output_path, file_name)
    plt.savefig(image_path)
    print('image saved at', str(image_path))
def continuous_var_histplot(x, output_path=None, bins=None):
    """Draw the histogram of a continuous variable ``x``.

    Args:
        x: Values to plot, passed to ``sns.histplot`` as ``data``. When it has
            a ``name`` attribute (e.g. a Series) that name is used in the
            saved file name.
        output_path: Directory in which ``Histplot_<name>.png`` is saved.
            When ``None`` the figure is not saved.
        bins: Optional bin specification forwarded to ``sns.histplot``. When
            ``None`` seaborn's own default binning is used.
    """
    plt.figure(figsize=(15, 10))
    # Forward ``bins`` only when supplied: it used to be silently ignored, and
    # an explicit None must not override seaborn's default.
    hist_kwargs = {} if bins is None else {'bins': bins}
    sns.histplot(data=x, **hist_kwargs)
    if output_path is not None:
        # Inputs without a ``name`` (lists, arrays) fall back to None, which
        # yields the same file name as an unnamed Series.
        label = getattr(x, 'name', None)
        output = os.path.join(output_path, 'Histplot_' + str(label) + '.png')
        plt.savefig(output)
        print('Image saved at', str(output))
def scatter_plot(x, y, data, output_path=None):
    """Draw a scatter plot of ``y`` against ``x``.

    Args:
        x: Variable for the x axis, forwarded to ``sns.scatterplot``; either
            a column name in ``data`` or an object with a ``name`` attribute.
        y: Variable for the y axis, same conventions as ``x``.
        data: Data set forwarded to ``sns.scatterplot``.
        output_path: Directory in which ``Scatter_plot_<x>_<y>.png`` is
            saved. When ``None`` the figure is not saved.
    """
    plt.figure(figsize=(15, 10))
    sns.scatterplot(x=x, y=y, data=data)
    if output_path is not None:
        # Column-name strings have no ``.name``; use the value itself for
        # them, as the other plot helpers in this file do.
        x_label = getattr(x, 'name', x)
        y_label = getattr(y, 'name', y)
        output = os.path.join(
            output_path,
            'Scatter_plot_' + str(x_label) + '_' + str(y_label) + '.png',
        )
        plt.savefig(output)
        print('Image saved at', str(output))
def correlation_plot(data, output_path=None):
    """Draw an annotated heat map of the correlation matrix of ``data``.

    Args:
        data: Object exposing ``corr()`` (e.g. a DataFrame).
        output_path: Directory in which ``Corr_plot.png`` is saved. When
            ``None`` the figure is not saved.
    """
    correlations = data.corr()
    figure, _axes = plt.subplots()
    figure.set_size_inches(11, 11)
    sns.heatmap(correlations, cmap='YlGnBu', linewidths=0.5, annot=True)
    if output_path is None:
        return

    image_path = os.path.join(output_path, 'Corr_plot' + '.png')
    plt.savefig(image_path)
    print('Image saved at', str(image_path))
def heatmap(data, output_path=None, fmt='d'):
    """Draw an annotated heat map of ``data``.

    Args:
        data: Rectangular data forwarded to ``sns.heatmap``.
        output_path: Directory in which ``Heatmap.png`` is saved. When
            ``None`` the figure is not saved.
        fmt: Format code for the cell annotations, forwarded to
            ``sns.heatmap``.
    """
    figure, _axes = plt.subplots()
    figure.set_size_inches(11, 11)
    sns.heatmap(data, cmap='YlGnBu', linewidths=.5, annot=True, fmt=fmt)
    if output_path is None:
        return

    image_path = os.path.join(output_path, 'Heatmap' + '.png')
    plt.savefig(image_path)
    print('Image saved at', str(image_path))
|