random forest from scratch python github

Solutions on MaxInterview for random forest from scratch python github by the best coders in the world

showing results for - "random forest from scratch python github"
Youna
06 Aug 2020
1import numpy as np
2from collections import Counter
3
4#implement decision tree
5def entropy(d):
6    c = np.bincount(d)
7    c = c[c != 0]
8    prop = c/len(d)
9    E = - prop*np.log2(prop)
10    return np.sum(E)
11
12class Node:
13    def __init__(self, beast_split_feature = None, threashold = None, left = None,right = None,*,value = None):
14        self.beast_split_feature = beast_split_feature
15        self.threashold  = threashold
16        self.left = left
17        self.right = right
18        self.value = value
19    def is_leaf(self):
20        return self.value is not None
21
22
23
24class Tree:
25    def __init__(self, min_split_sample = 2, max_depth = 100, n_feat = None):
26        self.min_split_sample = min_split_sample
27        self.max_depth = max_depth
28        self.n_feat = n_feat
29        self.root = None
30
31    def fit(self, X,y):
32        _, n_features = X.shape
33        self.n_feat = n_features if  not self.n_feat else min(n_features, self.n_feat)
34        self.root = self._grow_tree(X,y)
35
36
37    def predict(self, X):
38        return np.array([self._trav(x, self.root) for x in X])
39    
40    def _trav(self, x, node):
41        if node.is_leaf():
42            return node.value
43        elif x[node.beast_split_feature] <= node.threashold:
44            return self._trav(x, node.left)
45        return self._trav(x, node.right)
46
47
48    def _grow_tree(self, X, y, depth = 0):
49        _, n_features = X.shape
50        n_classes = len(np.unique(y))
51
52        #stopping cratiria
53        if (depth >= self.max_depth or n_classes <= self.min_split_sample
54        or n_classes == 1):
55            leaf_val = self._most_commen_val(y)
56            return Node(value=leaf_val)
57
58        feat_idx = np.random.choice(n_features, self.n_feat, replace=False)
59
60        best_threash, best_feat = self._best_criteria(X, y, feat_idx)
61        left_idx, right_idx = self._split(X[:, best_feat] ,best_threash)
62
63        righTree = self._grow_tree(X[right_idx, :], y[right_idx], depth + 1)
64        lefTree = self._grow_tree(X[left_idx, :], y[left_idx], depth + 1)
65        return Node(best_feat, best_threash, lefTree, righTree)
66
67
68
69    
70    def _best_criteria(self, X, y, feat_idxs):
71        best_gian = -1
72        split_idx, split_thrishold = None, None
73
74        for feat_idx in feat_idxs:
75            X_cloumnd = X[:, feat_idx]
76            thrisholds = np.unique(X_cloumnd)
77            for thrishold in thrisholds:
78                gain = self._info_gain(y, X_cloumnd, thrishold)
79
80                if gain > best_gian:
81                    best_gian = gain
82                    split_idx = feat_idx
83                    split_thrishold = thrishold
84        return split_thrishold, split_idx
85    
86    def _info_gain(self, y, X_cloumnd, thrishold):
87        #perant entropy
88        perant_entropy = entropy(y)
89        #split
90        left_idx, right_idx = self._split(X_cloumnd, thrishold)
91        if len(left_idx) ==0  or len(right_idx) ==0:
92            return 0
93        left_label = y[left_idx]
94        right_label =  y[right_idx]
95        l_n, r_n = len(left_label), len(right_label)
96        child_entropy = (l_n/len(y))*entropy(left_label) + (r_n/len(y))*entropy(right_label)
97
98        #wighted avg
99        return perant_entropy - child_entropy
100    
101    def _split(self, X_cloumnd, thrishold):
102        left = np.argwhere(X_cloumnd < thrishold).flatten()
103        right = np.argwhere(X_cloumnd >= thrishold).flatten()
104        return left, right
105    
106    def _most_commen_val(self, y):
107        c = Counter(y)
108        most = c.most_common(1)
109        return most[0][0]
110
111# implement Random Forest
112def bootstrab(X, y):
113    n_sample = X.shape[0]
114    idexs = np.random.choice(n_sample,n_sample, replace=True)
115    return X[idexs], y[idexs]
116
117def most_commen_val(y):
118    c = Counter(y)
119    most = c.most_common(1)
120    return most[0][0]
121
122class RandomForst:
123
124    def __init__(self, num_tree = 100, min_split_sample = 2, max_depth = 100, n_feat = None):
125        self.num_tree = num_tree
126        self.min_split_sample = min_split_sample
127        self.max_depth = max_depth
128        self.n_feat = n_feat
129
130    def fit(self, X, y):
131        self.tree = []
132        for _ in range(self.num_tree):
133            tree = Tree(self.min_split_sample, self.max_depth, self.n_feat)
134            x_sample, y_sample  = bootstrab(X, y)
135            tree.fit(x_sample, y_sample)
136            self.tree.append(tree)
137        
138    def predict(self, X):
139        tree_predict = np.array([tree.predict(X) for tree in self.tree])
140        tree_predict = np.swapaxes(tree_predict, 0, 1)
141        y_pred = [most_commen_val(y) for y in tree_predict]
142        return y_pred
143