# pylint: disable=C0103, too-few-public-methods, locally-disabled, no-self-use, unused-argument
'''Implement a random forest from scratch'''

# Random Forest Algorithm on Sonar Dataset
from random import seed
from random import randrange
from csv import reader
from math import sqrt


# Load a CSV file
def load_csv(filename):
    '''Read *filename* as CSV and return its non-empty rows as lists of strings.'''
    rows = []
    with open(filename, 'r') as handle:
        for record in reader(handle):
            # csv yields [] for blank lines; drop them
            if record:
                rows.append(record)
    return rows


# Convert string column to float
def str_column_to_float(dataset, column):
    '''In place, convert *column* of every row to float (surrounding whitespace stripped).'''
    for record in dataset:
        record[column] = float(record[column].strip())


# Convert string column to integer
def str_column_to_int(dataset, column):
    '''Encode the class labels in *column* as integers, in place.

    Labels are sorted before numbering so the mapping is deterministic
    across runs: iterating a plain ``set`` follows hash order, which
    varies between interpreter runs under string-hash randomization.

    Returns the label -> integer lookup dict.
    '''
    class_values = [row[column] for row in dataset]
    lookup = {value: i for i, value in enumerate(sorted(set(class_values)))}
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup


# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    '''Randomly partition *dataset* into *n_folds* equal-size folds.

    Rows are drawn without replacement; any remainder rows (when the
    dataset size is not a multiple of *n_folds*) are left out.
    '''
    pool = list(dataset)
    fold_size = len(dataset) // n_folds
    folds = []
    for _ in range(n_folds):
        fold = [pool.pop(randrange(len(pool))) for _ in range(fold_size)]
        folds.append(fold)
    return folds


# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    '''Return the classification accuracy of *predicted* vs *actual* as a percentage.

    Returns 0.0 for empty input instead of raising ZeroDivisionError.
    '''
    if not actual:
        return 0.0
    correct = sum(1 for a, p in zip(actual, predicted) if a == p)
    return correct / float(len(actual)) * 100.0


# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    '''Score *algorithm* by k-fold cross validation.

    Trains on the other k-1 folds and predicts the held-out fold, once
    per fold. Extra *args* are forwarded to *algorithm*. Returns the
    list of per-fold accuracy percentages.
    '''
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for held_out in folds:
        training = [row for other in folds if other is not held_out for row in other]
        # copy the test rows and hide the label from the algorithm
        testing = []
        for row in held_out:
            masked = list(row)
            masked[-1] = None
            testing.append(masked)
        predicted = algorithm(training, testing, *args)
        actual = [row[-1] for row in held_out]
        scores.append(accuracy_metric(actual, predicted))
    return scores


86+ # Split a dataset based on an attribute and an attribute value
87+ def test_split (index , value , dataset ):
88+ left , right = list (), list ()
89+ for row in dataset :
90+ if row [index ] < value :
91+ left .append (row )
92+ else :
93+ right .append (row )
94+ return left , right
95+
96+
# Calculate the Gini index for a split dataset
def gini_index(groups, classes):
    '''Weighted Gini impurity of a candidate split.

    *groups* is the partition produced by the split and *classes* the
    distinct class labels. 0.0 means each group is pure; larger values
    mean a more mixed (worse) split.
    '''
    total = float(sum(len(group) for group in groups))
    gini = 0.0
    for group in groups:
        size = float(len(group))
        if size == 0:
            continue  # an empty side contributes nothing (and avoids /0)
        labels = [row[-1] for row in group]
        # sum of squared class proportions within the group
        score = 0.0
        for cls in classes:
            proportion = labels.count(cls) / size
            score += proportion * proportion
        # weight the group's impurity by its share of the samples
        gini += (1.0 - score) * (size / total)
    return gini


# Select the best split point for a dataset
def get_split(dataset, n_features):
    '''Find the best split among *n_features* randomly chosen features.

    Every value of each candidate feature present in *dataset* is tried
    as a split point; the one with the lowest Gini index wins. Returns
    a node dict with 'index', 'value' and the resulting 'groups'.
    '''
    class_values = list(set(row[-1] for row in dataset))
    best_index, best_value, best_score, best_groups = 999, 999, 999, None
    # sample distinct candidate feature indices (label column excluded)
    features = []
    while len(features) < n_features:
        candidate = randrange(len(dataset[0]) - 1)
        if candidate not in features:
            features.append(candidate)
    for feature in features:
        for row in dataset:
            groups = test_split(feature, row[feature], dataset)
            gini = gini_index(groups, class_values)
            if gini < best_score:
                best_index = feature
                best_value = row[feature]
                best_score = gini
                best_groups = groups
    return {'index': best_index, 'value': best_value, 'groups': best_groups}


# Create a terminal node value
def to_terminal(group):
    '''Return the most common class label among the rows of *group*.'''
    labels = [row[-1] for row in group]
    return max(set(labels), key=labels.count)


# Create child splits for a node or make terminal
def split(node, max_depth, min_size, n_features, depth):
    '''Recursively grow the tree below *node*, in place.

    A side becomes a terminal (majority-vote) label when: the split
    left one side empty, *max_depth* is reached, or the side has at
    most *min_size* rows; otherwise it is split again.
    '''
    left, right = node.pop('groups')
    # no effective split: both children share one majority-vote terminal
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # depth limit reached: terminate both sides
    if depth >= max_depth:
        node['left'] = to_terminal(left)
        node['right'] = to_terminal(right)
        return
    # grow left first, then right (order matters for RNG reproducibility)
    for side, rows in (('left', left), ('right', right)):
        if len(rows) <= min_size:
            node[side] = to_terminal(rows)
        else:
            node[side] = get_split(rows, n_features)
            split(node[side], max_depth, min_size, n_features, depth + 1)


# Build a decision tree
def build_tree(train, max_depth, min_size, n_features):
    '''Grow a single decision tree on *train* and return its root node dict.'''
    root = get_split(train, n_features)
    split(root, max_depth, min_size, n_features, 1)
    return root


# Make a prediction with a decision tree
def predict(node, row):
    '''Walk the tree from *node* down to a leaf for *row*; return the leaf label.'''
    branch = node['left'] if row[node['index']] < node['value'] else node['right']
    if isinstance(branch, dict):
        return predict(branch, row)
    return branch


# Create a random subsample from the dataset with replacement
def subsample(dataset, ratio):
    '''Bootstrap sample: draw ``round(len(dataset) * ratio)`` rows with replacement.'''
    n_sample = round(len(dataset) * ratio)
    return [dataset[randrange(len(dataset))] for _ in range(n_sample)]


# Make a prediction with a list of bagged trees
def bagging_predict(trees, row):
    '''Majority vote of the individual tree predictions for *row*.'''
    votes = [predict(tree, row) for tree in trees]
    return max(set(votes), key=votes.count)


# Random Forest Algorithm
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
    '''Train *n_trees* bagged trees on bootstrap samples of *train*.

    Each tree sees a subsample of size ``len(train) * sample_size`` and
    considers *n_features* random features per split. Returns the
    majority-vote prediction for every row of *test*.
    '''
    trees = []
    for _ in range(n_trees):
        bootstrap = subsample(train, sample_size)
        trees.append(build_tree(bootstrap, max_depth, min_size, n_features))
    return [bagging_predict(trees, row) for row in test]


# Test the random forest algorithm
def _main():
    '''Load the sonar dataset and evaluate the forest at several ensemble sizes.

    Wrapped in a function behind a ``__main__`` guard so importing this
    module no longer runs the whole experiment (and no longer reseeds
    the global RNG) as a side effect.
    '''
    seed(2)  # fixed seed so runs are reproducible
    # load and prepare data
    # NOTE(review): machine-specific absolute path — adjust for your environment
    filename = r'C:\development\python\opencvlib\test\bin\learning_data\sonar.all-data.csv'
    dataset = load_csv(filename)
    # all columns but the last are numeric features
    for i in range(len(dataset[0]) - 1):
        str_column_to_float(dataset, i)
    # the last column is the class label; encode it as integers
    str_column_to_int(dataset, len(dataset[0]) - 1)
    # evaluation parameters
    n_folds = 5
    max_depth = 10
    min_size = 1
    sample_size = 1.0
    # rule of thumb: consider sqrt(feature count) features per split
    n_features = int(sqrt(len(dataset[0]) - 1))
    for n_trees in [1, 5, 10]:
        scores = evaluate_algorithm(dataset, random_forest, n_folds,
                                    max_depth, min_size, sample_size,
                                    n_trees, n_features)
        print('Trees: %d' % n_trees)
        print('Scores: %s' % scores)
        print('Mean Accuracy: %.3f%%' % (sum(scores) / float(len(scores))))


if __name__ == '__main__':
    _main()