# GA_FS_Algo.py

# -*- coding: utf-8 -*-
"""DatasetFSGA.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/12xSP2KvDZP3sQCsdUAneKMM7ohQMCQJs
"""

import numpy as np
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import gaussian_kde
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel, RFE
from deap import algorithms, base, creator, tools
import random
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from google.colab import files
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# --- Data loading and preprocessing ---------------------------------------
# Google Colab only: opens the upload dialog so 'data.csv' can be placed in
# the working directory before it is read below.
uploaded = files.upload()

# Read the CSV and drop every column that contains at least one missing value.
df = pd.read_csv('data.csv').dropna(axis=1)

# Column 1 holds the class label; columns 2..30 are used as features.
# NOTE(review): the slice stop (31) is exclusive, so column 31 is not
# included. If data.csv has 30 feature columns after the label, the last one
# is silently left out -- confirm whether this is intended.
X, y = df.iloc[:, 2:31].values, df.iloc[:, 1].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Encode the string class labels as integers. The encoder is fitted on the
# training labels only and then applied to both splits.
# (Presumably 'B' -> 0 and 'M' -> 1, since LabelEncoder numbers the classes in
# sorted order -- verify against label_encoder.classes_.)
label_encoder = LabelEncoder().fit(y_train)
y_train, y_test = label_encoder.transform(y_train), label_encoder.transform(y_test)

# Standardise the features using statistics learned from the training split
# only, then apply the same transformation to the test split.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

# Linear-kernel SVM used as the wrapped model inside the GA fitness function.
classifier = SVC(kernel='linear')

# and then defining the fitness function
def evaluate_fitness(individual):
    """Score a feature subset by the accuracy of the module-level classifier.

    Args:
        individual: Sequence of 0/1 genes, one per feature column; a truthy
            gene means the corresponding column is kept.

    Returns:
        A 1-tuple ``(accuracy,)`` as required by DEAP's fitness interface.
        An individual that selects no feature at all gets ``(0.0,)``, the
        lowest possible accuracy, instead of raising.

    NOTE(review): the accuracy is measured on ``X_test``/``y_test``, the same
    split the script later uses to report final metrics, so those reported
    scores are optimistically biased. Consider scoring on a validation split
    (or cross-validation) of the training data instead.
    """
    selected_features = [bool(bit) for bit in individual]

    # An all-zero mask would yield a matrix with zero columns, which the
    # classifier cannot be fitted on; give such individuals the worst fitness
    # so selection discards them.
    if not any(selected_features):
        return 0.0,

    # Boolean-mask the columns of the (module-level) train/test matrices.
    X_train_selected = X_train[:, selected_features]
    X_test_selected = X_test[:, selected_features]

    classifier.fit(X_train_selected, y_train)
    y_pred = classifier.predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy,

# Defining the individual and population.
# FitnessMax is a single-objective fitness with weight +1.0, i.e. the value
# returned by the evaluation function is maximised.
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

# Initializing the toolbox: an individual is a list of random 0/1 genes, one
# gene per feature column of X.
toolbox = base.Toolbox()
toolbox.register("attribute", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attribute, n=len(X[0]))
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Defining the genetic operators
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("evaluate", evaluate_fitness)

# Setting the parameters for the genetic algorithm
population_size = 50
generations = 40
cxpb = 0.5   # probability that a pair of individuals is crossed over
mutpb = 0.2  # probability that an individual is mutated

# Creating the initial population
population = toolbox.population(n=population_size)

# Running the genetic algorithm, remembering the best individual ever seen
# (the population itself has no elitism, so the best may not survive in it).
best_individual = None
best_fitness = float("-inf")
for gen in range(generations):
    # varAnd works on clones of the population and invalidates the fitness of
    # every individual it modified by crossover or mutation.
    offspring = algorithms.varAnd(population, toolbox, cxpb=cxpb, mutpb=mutpb)

    # Only individuals without a valid fitness need to be (re-)evaluated;
    # untouched clones keep the fitness computed in an earlier generation.
    invalid_individuals = [ind for ind in offspring if not ind.fitness.valid]
    fitness_values = toolbox.map(toolbox.evaluate, invalid_individuals)
    for ind, fitness in zip(invalid_individuals, fitness_values):
        ind.fitness.values = fitness
        if fitness[0] > best_fitness:
            # Keep an independent copy so later in-place variation can never
            # alter the recorded best solution.
            best_individual = toolbox.clone(ind)
            best_fitness = fitness[0]

    population = toolbox.select(offspring, k=len(population))

# Turn the best individual's 0/1 genes into a boolean column mask.
selected_features = list(map(bool, best_individual))

# Keep only the selected feature columns in both splits.
X_train_selected = X_train[:, selected_features]
X_test_selected = X_test[:, selected_features]

# Wrap the (full) training matrix in a DataFrame and show both container types.
X_train_df = pd.DataFrame(X_train)
print('Type: ', type(X_train_df))
print('Type: ', type(X_train))

# RBF-kernel SVM trained on the selected features, then used to predict the
# class labels of the test split.
clf = svm.SVC(kernel='rbf', random_state=0).fit(X_train_selected, y_train)
y_pred = clf.predict(X_test_selected)

# Random forest (10 trees, entropy criterion) on the same selected features.
clf_RF = RandomForestClassifier(
    n_estimators=10, criterion='entropy', random_state=0
).fit(X_train_selected, y_train)
y_pred_RF = clf_RF.predict(X_test_selected)

# Single decision tree (entropy criterion) on the same selected features.
tree = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(
    X_train_selected, y_train
)
y_pred_DT = tree.predict(X_test_selected)

n_components = 2  # not referenced by any other line visible in this script

# Feed-forward network: two 64-unit ReLU layers, each followed by 50% dropout,
# and a single sigmoid output unit for the binary label.
# NOTE(review): the network is trained on X_train (all columns), not on
# X_train_selected -- confirm this is meant as an "all features" baseline.
model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss optimised with Adam.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# Train silently for 100 epochs with mini-batches of 10 samples.
model.fit(X_train, y_train, epochs=100, batch_size=10, verbose=0)

# Threshold the sigmoid outputs at 0.5 and flatten to a 1-D array of 0/1 labels.
ann_probabilities = model.predict(X_test)
ann_predictions = (ann_probabilities > 0.5).astype(int).flatten()

# Test-set accuracy of every model trained so far.
accuracy, accuracy_RF, accuracy_DT, accuracy_ANN = (
    accuracy_score(y_test, predictions)
    for predictions in (y_pred, y_pred_RF, y_pred_DT, ann_predictions)
)

for caption, score in (
    ("Accuracy SVM:", accuracy),
    ("Accuracy RF:", accuracy_RF),
    ("Accuracy DT:", accuracy_DT),
    ("Accuracy ANN:", accuracy_ANN),
):
    print(caption, score)

# Confusion matrix of the ANN: rows are true labels, columns are predictions,
# so the flattened order is TN, FP, FN, TP.
cm = confusion_matrix(y_test, ann_predictions)
TN, FP, FN, TP = cm.ravel()

print(cm)
testing_accuracy = (TP + TN) / (TP + TN + FN + FP)
print('Testing Accuracy = "{}!"'.format(testing_accuracy))
print()

# Build and train the scikit-learn MLP classifier (two hidden layers of 64
# ReLU units, Adam solver) on the full training matrix, then predict the
# test split.
mlp_classifier = MLPClassifier(hidden_layer_sizes=(64, 64), activation='relu', solver='adam', max_iter=100, random_state=0)
mlp_classifier.fit(X_train, y_train)
mlp_predictions = mlp_classifier.predict(X_test)

# precision_recall_fscore_support returns its results in the order
# (precision, recall, fbeta_score, support); unpack in that order so each
# name holds the metric it is labelled with.
mlp_precision, mlp_recall, _, _ = precision_recall_fscore_support(y_test, mlp_predictions, average='binary')

# Printing recall and precision for MLP
print("Recall MLP:", mlp_recall)
print("Precision MLP:", mlp_precision)

# Per-class precision / recall / F1 for every model, printed with four
# decimal places. Each entry pairs the report heading with that model's
# test-set predictions.
report_inputs = (
    ("\nClassification Report SVM:\n", y_pred),
    ("Classification Report RF:\n", y_pred_RF),
    ("Classification Report DT:\n", y_pred_DT),
    ("Classification Report ANN:\n", ann_predictions),
    ("Classification Report MLP:\n", mlp_predictions),
)
for heading, predictions in report_inputs:
    print(heading, classification_report(y_test, predictions, digits=4))