
I've been working on a random forest algorithm for classification that uses roulette selection to find the best splits. I came up with this homebrew implementation based on this article: https://machinelearningmastery.com/implement-random-forest-scratch-python/.

It works well enough, but I have a major problem with execution time. It's reasonably fast for 1-6 trees, but it completely chokes on 7 or more. I'm looking for ways to make it more efficient so that it doesn't take a month to finish basic tests. I'm a beginner and don't know much about optimizing code, so I'd appreciate any suggestions on how I could improve. Here's the link to the GitHub repo: https://github.com/pdabrow11/Random-Forest-Roulette. It contains all the .py files and the datasets used for classification. Here's the raw code:

  • data_load.py
import csv

def read_file(file):
    with open(file, 'r') as myfile:
        data = myfile.readlines()
    return data

def load_data(data, var):
    dataset = list()
    for row in data:
        row = row.rstrip("\n")
        if var == 1:
            dataset.append(row.split(","))
        if var == 2:
            dataset.append(row.split(" "))
        if var == 3:
            dataset.append(row.split(";"))
    return dataset


def process_dat_data(data):
    dataset = list()
    for row in data:
        row_temp = list()
        for i in range(1, len(row)):
            res = row[i].find(':')
            row_temp.append(row[i][res+1:])
        row_temp.append(row[0])
        dataset.append(row_temp)
    return dataset

def str2int(dataset):
    for i in range(0, len(dataset[0])):
        class_values = [row[i] for row in dataset]
        unique = set(class_values)
        lookup = dict()
        for j, value in enumerate(unique):
            lookup[value] = float(j)
        for row in dataset:
            row[i] = lookup[row[i]]

def str2float(dataset):
    for row in dataset:
        for i in range(0, len(row)):
            row[i] = float(row[i])
    return dataset

def write_data(data):
    with open("tests_res.csv", 'a') as file:
        f = csv.writer(file, delimiter =',', lineterminator='\n')
        f.writerows(data)
  • random_forest.py
from math import sqrt
from random import seed
from random import randrange
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone
import numpy as np
import numpy.random as npr

def cross_validation_split(dataset, folds_num):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / folds_num)
    for i in range(folds_num):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split
 
def calc_accuracy(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0
 
def evaluate_algorithm(dataset, folds_num, trees_num, *args):
    attributes_num = int(sqrt(len(dataset[0])-1))
    folds = cross_validation_split(dataset, folds_num)
    scores = list()
    scores_sklearn = list()
    res = RandomForestClassifier(n_estimators=trees_num)
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        y_test_set = list()
        X, y = list(), list()
        for row in fold:
            #row_copy = list(row)
            #row_copy[-1] = None
            test_set.append(row[:-1])
            y_test_set.append(row[-1])
        predicted = random_forest(train_set, test_set, attributes_num, trees_num, *args)

        actual = [row[-1] for row in fold]
        accuracy = calc_accuracy(actual, predicted)
        scores.append(accuracy)

        # Sklearn
        for row in train_set:
            X.append(row[:-1])
            y.append(row[-1])
        #res = clone(res)
        res.fit(X, y)
        accuracy_sklearn = res.score(test_set, y_test_set)*100
        scores_sklearn.append(accuracy_sklearn)
    return scores, scores_sklearn
 
def split_attribute(dataset, index, value):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right
 
def subset(dataset, ratio):
    sample = list()
    sample_num = round(len(dataset) * ratio)
    while len(sample) < sample_num:
        index = randrange(len(dataset))
        sample.append(dataset[index])
    return sample

def gini_index(groups, classes):
    examples_num = int(sum([len(group) for group in groups]))
    gini = 0.0
    for group in groups:
        size = int(len(group))
        if size == 0:
            continue
        P, E = 0.0, 1.0
        for single_class in classes:
            P = [row[-1] for row in group].count(single_class) / size
            E -= P ** 2
        gini += (size / examples_num) * E
    return gini
 
def roulette_split(dataset, attributes_num, threshold):
    classes = list(set(row[-1] for row in dataset))

    index_list, val_list, group_list, fit = [], [], [], []
    attributes = list()
    while len(attributes) < attributes_num:
        index = randrange(len(dataset[0])-1)
        if index not in attributes:
            attributes.append(index)
    counter = 0
    for index in attributes:
        for row in dataset:
            groups = split_attribute(dataset, index, row[index])
            gini = gini_index(groups, classes)
            index_list.append(index)
            val_list.append(row[index])
            group_list.append(groups)
            fit.append(1-gini)
            counter += 1
    wheel_size = 0
    fit_args_sorted = np.argsort(fit)
    for i in range (0, int(threshold*counter)):
        fit[fit_args_sorted[i]] = 0
    for i in range (0, counter):
        wheel_size += fit[i]
    
    selection_probs = [fit[i]/wheel_size for i in range (0, counter)]

    winner = int(npr.choice(np.arange(counter), 1, p=selection_probs))
    return {'val':val_list[winner], 'groups':group_list[winner], 'index':index_list[winner]}

def terminal(group):
    results = [row[-1] for row in group]
    return max(results, key=results.count)

def one_class(node):
    res = True
    for i in range(0, len(node)):
        if node[0][-1] != node[i][-1]:
                res = False
                return res
    return res

def are_the_same(node):
    res = True
    for i in range(0, len(node[0])-1):
        for j in range(0, len(node)):
            for k in range(0, len(node)):
                if node[j][i] != node[k][i]:
                    res = False
                    return res
    return res

def split(node, attributes_num, depth, threshold):
    left, right = node['groups']
    del(node['groups'])

    if not left or not right:
        node['left'] = node['right'] = terminal(left + right)
        return

    if one_class(left) or are_the_same(left):
        node['left'] = terminal(left)
    else:
        node['left'] = roulette_split(left, attributes_num, threshold)
        split(node['left'], attributes_num, depth+1, threshold)

    if one_class(right) or are_the_same(right):
        node['right'] = terminal(right)
    else:
        node['right'] = roulette_split(right, attributes_num, threshold)
        split(node['right'], attributes_num, depth+1, threshold)
 

def build_tree(train, attributes_num, threshold):
    root = roulette_split(train, attributes_num, threshold)
    split(root, attributes_num, 1, threshold)
    return root
 

def predict(node, row):
    if row[node['index']] < node['val']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']
 

def bagging_predict(trees, row):
    predictions = [predict(tree, row) for tree in trees]
    return max(set(predictions), key=predictions.count)
 

def random_forest(train, test, attributes_num, trees_num, sample_size, threshold):
    trees = list()
    for i in range(trees_num):
        sample = subset(train, sample_size)
        tree = build_tree(sample, attributes_num, threshold)
        trees.append(tree)
    predictions = [bagging_predict(trees, row) for row in test]
    return(predictions)
  • launch.py
from asyncore import write
from random_forest import *
from data_load import *

from random_forest import *
from data_load import *

def main():

    data_absence = read_file("datasets_v2/Absenteeism_at_work.csv")
    dataset_absence = load_data(data_absence, 3)
    dataset_absence = dataset_absence[1:]
    str2float(dataset_absence)

    data_car = read_file("datasets_v2/car.data")
    dataset_car = load_data(data_car, 1)
    str2int(dataset_car)
    
    data_opt = read_file("datasets_v2/batch_optdigits.tra")
    dataset_opt = load_data(data_opt, 1)
    str2float(dataset_opt)

    data_gas = read_file("datasets_v2/GasSensorDataset/batch1.dat")
    dataset_gas = load_data(data_gas, 2)
    dataset_gas = process_dat_data(dataset_gas)
    str2float(dataset_gas)
    
    sk_scores = []
    for_scores = []
    col_1 = []
    col_2 = []
    col_3 = []
    col_4 = []
    col_5 = []
    col_6 = []
    col_7 = []
    col_8 = []


    for dataset in [dataset_car, dataset_absence, dataset_gas]:
        sample_size = 0.05
        folds_num = 5
        for threshold in [0.2 , 0.5, 0.9]:
            print('For my information - threshold: ', threshold, '\n')
            for trees_num in range(1,10):
                sk_scores.clear()
                for_scores.clear()
                for i in [1,2,3,5]:
                    seed(i)
                    scores = evaluate_algorithm(dataset, folds_num, trees_num, sample_size, threshold)
                    score = sum(scores[0])/float(len(scores[0]))
                    for_scores.append(score)
                    sk_score = sum(scores[1])/float(len(scores[1]))
                    sk_scores.append(sk_score)
                if dataset == dataset_car:
                    col_1.append('SECOM Dataset')
                    col_1.append('SECOM Dataset')
                elif dataset == dataset_gas:
                    col_1.append('Gas Sensor Dataset')
                    col_1.append('Gas Sensor Dataset')
                else:
                    col_1.append('Absenteeism Dataset')
                    col_1.append('Absenteeism Dataset')
                for_mean = round(np.mean(for_scores),2)
                for_stdev = round(np.std(for_scores),2)
                for_best = round(np.amax(for_scores),2)
                for_worst = round(np.amin(for_scores),2)
                sk_mean = round(np.mean(sk_scores),2)
                sk_stdev = round(np.std(sk_scores),2)
                sk_best = round(np.amax(sk_scores),2)
                sk_worst = round(np.amin(sk_scores),2)
                col_2.append('Own implementation')
                col_2.append('Sklearn')
                col_3.append(trees_num)
                col_3.append(trees_num)
                col_4.append(threshold)
                col_4.append(threshold)
                col_5.append(for_mean)
                col_6.append(for_stdev)
                col_7.append(for_best)
                col_8.append(for_worst)
                col_5.append(sk_mean)
                col_6.append(sk_stdev)
                col_7.append(sk_best)
                col_8.append(sk_worst)
                print(trees_num)
    header = ['Dataset', 'Implementation', 'Number of trees', 'Selection threshold', 'Mean quality', 'Standard deviation', 'Best quality', 'Worst quality']
    file_data = np.column_stack((col_1, col_2, col_3, col_4, col_5, col_6, col_7, col_8))
    file_data = np.vstack((header, file_data))
    open('tests_res.csv', 'w').close()
    write_data(file_data)

if __name__ == "__main__":
    main()

1 Answer

Documentation

The PEP 8 style guide recommends adding docstrings for modules and functions.

For example, for the data_load.py file, since "data" is a generic term, it would be good to describe what kind of data is being loaded.

For functions, it would be useful for the docstring to have details about input types and return types.
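For example, a sketch of what this could look like at the top of data_load.py (the wording here is only a guess at the intent):

"""Load raw classification datasets (comma-, space- and semicolon-separated)
into lists of rows for the random forest code."""

def read_file(file):
    """Read a text file and return its lines as a list of strings."""
    with open(file, 'r') as myfile:
        data = myfile.readlines()
    return data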

Naming

There are many variables with the generic word "data". I suggest renaming some of them to be more specific about the type of data they represent.

The function named process_dat_data even has "dat" twice. That seems redundant, and one of them can be replaced with a better word.

In the load_data function, the var input does not convey much meaning. It might be better as separator_type.
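For example, after such a renaming, a call site in launch.py might read (the new names are only suggestions):

raw_car_lines = read_file("datasets_v2/car.data")
dataset_car = load_data(raw_car_lines, separator_type=1)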

Efficiency

In the load_data function for loop, the 3 separate if statements should be combined into a single if/elif statement:

if var == 1:
    dataset.append(row.split(","))
elif var == 2:
    dataset.append(row.split(" "))
elif var == 3:
    dataset.append(row.split(";"))
The checks are mutually exclusive, so once one of them matches, the remaining comparisons are skipped. This also shows the intent of the code more clearly.
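Another option in the same spirit is to make the var-to-separator mapping explicit with a lookup table, which removes the branching entirely (a sketch; SEPARATORS is a hypothetical name):

SEPARATORS = {1: ",", 2: " ", 3: ";"}

def load_data(data, var):
    separator = SEPARATORS[var]
    return [row.rstrip("\n").split(separator) for row in data]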

An alternate approach is to avoid the intermediate var mapping variable, and directly pass in the separator:

def load_data(data, separator):
    dataset = list()
    for row in data:
        row = row.rstrip("\n")
        if separator in [",", " ", ";"]:
            dataset.append(row.split(separator))
    return dataset
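With that version, the call sites in launch.py would pass the separator directly, e.g.:

data_car = read_file("datasets_v2/car.data")
dataset_car = load_data(data_car, ",")

One caveat: the separator in [",", " ", ";"] membership test runs once per row even though the separator never changes, so it would be slightly cheaper (and clearer) to validate the separator once before the loop.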

DRY

In the launch.py file, these 2 lines are repeated:

from random_forest import *
from data_load import *

You should delete the copies.

In the main function, there are many repetitious calls to append:

if dataset == dataset_car:
    col_1.append('SECOM Dataset')
    col_1.append('SECOM Dataset')
elif dataset == dataset_gas:
    col_1.append('Gas Sensor Dataset')
    col_1.append('Gas Sensor Dataset')
else:
    col_1.append('Absenteeism Dataset')
    col_1.append('Absenteeism Dataset')

They can be factored out of the if/else:

text = ''
if dataset == dataset_car:
    text = 'SECOM'
elif dataset == dataset_gas:
    text = 'Gas Sensor'
else:
    text = 'Absenteeism'
for _ in range(2):
    col_1.append(f"{text} Dataset")

Simpler

In the random_forest.py file, the one_class function can be simplified by removing the res variable:

def one_class(node):
    for i in range(len(node)):
        if node[0][-1] != node[i][-1]:
            return False
    return True

The range start 0 can also be removed.

The same can be done for the are_the_same function.
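For example, a sketch for are_the_same: checking that every pair of rows matches is equivalent to checking every row against the first one, so the double inner loop can also collapse into a single pass:

def are_the_same(node):
    first = node[0]
    # Compare each attribute of every row against the first row,
    # skipping the class label in the last column.
    for i in range(len(first) - 1):
        for row in node:
            if row[i] != first[i]:
                return False
    return True

This also reduces the number of comparisons from quadratic to linear in the number of rows, which should help with the execution times mentioned in the question.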
