# With the following function we can select highly correlated features.
# It flags every feature that is correlated with any earlier feature (in column order),
# so the flagged features can then be removed while the earlier ones are kept.
3
def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    The pairwise correlation matrix is taken from ``dataset.corr()``. A column
    is flagged when the absolute value of its correlation with at least one
    column positioned before it is strictly greater than ``threshold``.
    The first column has no predecessors and is therefore never flagged.

    Args:
        dataset: Object exposing ``.corr()`` that returns a square, labelled
            correlation matrix (e.g. a pandas DataFrame).
        threshold: Cut-off compared against the absolute correlation value.

    Returns:
        A set containing the names of the flagged columns.
    """
    # Work on absolute values: strong negative correlation counts as well.
    abs_corr = dataset.corr().abs()
    # For the column at position ``pos``, ``iloc[pos, :pos]`` selects the part
    # of its row strictly left of the diagonal, i.e. its correlations with the
    # earlier columns only. NaN entries compare as False and are ignored.
    return {
        name
        for pos, name in enumerate(abs_corr.columns)
        if (abs_corr.iloc[pos, :pos] > threshold).any()
    }
13
14
15
# Collect the training-set features whose absolute correlation with an
# earlier column exceeds 0.7.
corr_features = correlation(X_train, 0.7)
# correlation() already returns a set, so its length is the number of
# distinct flagged features.
len(corr_features)