# With the following function we can select highly correlated features.
# For every pair of correlated features it flags the one that appears later
# in the column order, so the earlier feature of the pair is the one kept.
3
def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    Args:
        dataset: Object exposing a pandas-style ``corr()`` method that returns
            a square correlation matrix (presumably a ``DataFrame`` of
            features -- confirm against the caller).
        threshold: Cut-off applied to the absolute correlation coefficient;
            only values strictly greater than it count as correlated.

    Returns:
        A set of column names. A column is included when its absolute
        correlation with at least one column positioned before it exceeds
        ``threshold``. The first column is never included because nothing
        precedes it.
    """
    # The sign of the coefficient is irrelevant here, so take magnitudes once.
    strength = dataset.corr().abs()
    flagged = set()
    for position, name in enumerate(strength.columns):
        # Only the lower triangle is inspected: compare against earlier columns.
        earlier = strength.iloc[position, :position]
        if (earlier > threshold).any():
            flagged.add(name)
    return flagged
13
14
15
# Collect the X_train columns whose absolute correlation with an earlier
# column exceeds 0.7, then report how many were flagged.
corr_features = correlation(X_train, threshold=0.7)
len(corr_features)