Source code for pyexplainer.pyexplainer_pyexplainer

import copy
import math
import os
import random
import re
import sys
import string
import warnings
import ipywidgets as widgets
import numpy as np
import pandas as pd
import scipy as sp
import sklearn
from IPython.core.display import display, HTML
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state, all_estimators
from .rulefit import RuleFit
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import pickle


[docs]def AutoSpearman(X_train, correlation_threshold=0.7, correlation_method='spearman', VIF_threshold=5):
    """An automated feature selection approach that addresses collinearity and multicollinearity.
    For more information, please kindly refer to the `paper <https://ieeexplore.ieee.org/document/8530020>`_.

    Parameters
    ----------
    X_train : :obj:`pd.core.frame.DataFrame`
        The X_train data to be processed
    correlation_threshold : :obj:`float`
        Threshold value of correlation.
    correlation_method : :obj:`str`
        Method for computing the correlation between the features.
    VIF_threshold : :obj:`int`
        Threshold value of VIF score.
    """
    X_AS_train = X_train.copy()
    AS_metrics = X_AS_train.columns
    count = 1

    # (Part 1) Automatically select non-correlated metrics based on a Spearman rank correlation test.
    print('(Part 1) Automatically select non-correlated metrics based on a Spearman rank correlation test')
    while True:
        corrmat = X_AS_train.corr(method=correlation_method)
        top_corr_features = corrmat.index
        abs_corrmat = abs(corrmat)

        # identify correlated metrics with respect to the correlation threshold
        highly_correlated_metrics = ((corrmat > correlation_threshold) | (corrmat < -correlation_threshold)) & (corrmat != 1)
        n_correlated_metrics = np.sum(np.sum(highly_correlated_metrics))
        if n_correlated_metrics > 0:
            # find the strongest pair-wise correlation
            find_top_corr = pd.melt(abs_corrmat, ignore_index=False)
            find_top_corr.reset_index(inplace=True)
            find_top_corr = find_top_corr[find_top_corr['value'] != 1]
            top_corr_index = find_top_corr['value'].idxmax()
            top_corr_i = find_top_corr.loc[top_corr_index, :]

            # get the 2 correlated metrics with the strongest correlation
            correlated_metric_1 = top_corr_i[0]
            correlated_metric_2 = top_corr_i[1]
            print('> Step', count, 'comparing between', correlated_metric_1, 'and', correlated_metric_2)

            # compute their correlation with other metrics outside of the pair
            correlation_with_other_metrics_1 = np.mean(
                abs_corrmat[correlated_metric_1][
                    [i for i in top_corr_features if i not in [correlated_metric_1, correlated_metric_2]]])
            correlation_with_other_metrics_2 = np.mean(
                abs_corrmat[correlated_metric_2][
                    [i for i in top_corr_features if i not in [correlated_metric_1, correlated_metric_2]]])
            print('>>', correlated_metric_1, 'has the average correlation of',
                  np.round(correlation_with_other_metrics_1, 3), 'with other metrics')
            print('>>', correlated_metric_2, 'has the average correlation of',
                  np.round(correlation_with_other_metrics_2, 3), 'with other metrics')

            # select the metric that shares the least correlation outside of the pair and exclude the other
            if correlation_with_other_metrics_1 < correlation_with_other_metrics_2:
                exclude_metric = correlated_metric_2
            else:
                exclude_metric = correlated_metric_1
            print('>>', 'Exclude', exclude_metric)
            count = count + 1
            AS_metrics = list(set(AS_metrics) - set([exclude_metric]))
            X_AS_train = X_AS_train[AS_metrics]
        else:
            break
    print('According to Part 1 of AutoSpearman,', AS_metrics, 'are selected.')

    # (Part 2) Automatically select non-correlated metrics based on a Variance Inflation Factor analysis.
    print('(Part 2) Automatically select non-correlated metrics based on a Variance Inflation Factor analysis')

    # Prepare a dataframe for VIF
    X_AS_train = add_constant(X_AS_train)
    selected_features = X_AS_train.columns
    count = 1
    while True:
        # Calculate VIF scores
        vif_scores = pd.DataFrame([variance_inflation_factor(np.array(X_AS_train.values, dtype=float), i)
                                   for i in range(X_AS_train.shape[1])],
                                  index=X_AS_train.columns)
        # Prepare a final dataframe of VIF scores
        vif_scores.reset_index(inplace=True)
        vif_scores.columns = ['Feature', 'VIFscore']
        vif_scores = vif_scores.loc[vif_scores['Feature'] != 'const', :]
        vif_scores.sort_values(by=['VIFscore'], ascending=False, inplace=True, kind='mergesort')

        # Find features whose VIF scores are above the threshold
        filtered_vif_scores = vif_scores[vif_scores['VIFscore'] >= VIF_threshold]

        # Terminate when no feature has a VIF score above the threshold
        if len(filtered_vif_scores) == 0:
            break

        # exclude the metric with the highest VIF score
        metric_to_exclude = list(filtered_vif_scores['Feature'].head(1))[0]
        print('> Step', count, '- exclude', str(metric_to_exclude))
        count = count + 1
        selected_features = list(set(selected_features) - set([metric_to_exclude]))
        X_AS_train = X_AS_train.loc[:, selected_features]
    print('Finally, according to Part 2 of AutoSpearman,', X_AS_train.columns, 'are selected.')
    X_AS_train = X_AS_train.drop('const', axis=1)
    return X_AS_train
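# Usage sketch (not part of the original module): applying AutoSpearman to a small,
# deliberately collinear DataFrame. The column names and values below are illustrative
# assumptions, not data shipped with pyexplainer.
# >>> import pandas as pd
# >>> from pyexplainer.pyexplainer_pyexplainer import AutoSpearman
# >>> toy = pd.DataFrame({'LOC': [10, 25, 40, 80, 120],
# ...                     'AddedLOC': [11, 24, 41, 79, 118],   # nearly identical to LOC
# ...                     'nCommit': [1, 5, 2, 9, 3]})
# >>> selected = AutoSpearman(toy, correlation_threshold=0.7, correlation_method='spearman', VIF_threshold=5)
# >>> list(selected.columns)   # one of the two collinear size metrics is dropped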
[docs]def get_base_prefix_compat():
    """Get base/real prefix, or sys.prefix if there is none."""
    return getattr(sys, "base_prefix", None) or getattr(sys, "real_prefix", None) or sys.prefix

[docs]def in_virtualenv():
    return get_base_prefix_compat() != sys.prefix
INSIDE_VIRTUAL_ENV = in_virtualenv()
[docs]def load_sample_data():
    this_dir, _ = os.path.split(__file__)
    path = this_dir + "/default_data/activemq-5.0.0.zip"
    if INSIDE_VIRTUAL_ENV:
        cwd = os.getcwd()
        path = cwd + '/pyexplainer/default_data/activemq-5.0.0.zip'
    return pd.read_csv(path)
[docs]def data_validation(data):
    """Validate that the given data is a list of dictionaries.

    Parameters
    ----------
    data : :obj:`Any`
        Data to be validated.

    Returns
    -------
    :obj:`bool`
        True: The data is a list of dictionaries.\n
        False: The data is not a list of dictionaries.
    """
    valid = True
    if isinstance(data, list):
        for i in range(len(data)):
            if not isinstance(data[i], dict):
                print("Data Format Error - the input data should be a list of dictionaries")
                valid = False
                break
    else:
        valid = False
    return valid
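# Usage sketch (illustrative, not part of the original module): data_validation accepts
# only a list of dictionaries.
# >>> data_validation([{'riskScore': ['80%']}, {'riskPred': ['Defect']}])
# True
# >>> data_validation({'riskScore': ['80%']})
# False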
[docs]def filter_rules(rules, X_explain): """Get rules that are actually applied to the commit Parameters ---------- rules : :obj:`pandas.core.frame.DataFrame` Rules data under the column called 'rule' inside Rules DF generated by RuleFit X_explain : :obj:`pandas.core.frame.DataFrame` Features to be explained by the local RuleFit model, can be seen as X_test Returns ------- :obj:`pandas.core.frame.DataFrame` A DataFrame that contains filtered rules """ def eval_rule(rule, x_df): """Parsing pattern of one rule (a single row under the rule column inside Rules DF generated by RuleFit) Note. Only support names of rules (feature names) that has CamelCase pattern with or without underscore e.g. 'ThisIsLegal_Name', 'this_is_legal_name', 'thisislegalname', 'ThisIsLegalName' Name like 'this is not a legal name', consider using '_' instead of ' ' to concatenate """ var_in_rule = list(set(re.findall('[a-z_*A-Z]+', rule))) rule = re.sub(r'\b=\b', '==', rule) if 'or' in var_in_rule: var_in_rule.remove('or') if 'e' in var_in_rule and 'e' not in x_df.columns: var_in_rule.remove('e') rule = rule.replace('&', 'and') eval_result_list = [] for i in range(0, len(x_df)): x = x_df.iloc[[i]] var_dict = {} for var in var_in_rule: var_dict[var] = float(x[var]) eval_result = eval(rule, var_dict) eval_result_list.append(eval_result) return eval_result_list # select rules that (1) have positive coefficient values and (2) have positive importance scores rules = rules[(rules['type'] == 'rule') & ( rules['coef'] > 0) & (rules['importance'] > 0)] rules_list = list(rules['rule']) rule_eval_result = [] # for each rule, check whether such rules apply to the actual instance to be explained # Note. # you may pass rules variable to eval_rule() to get the result of all rules # never try, but if it works please tell me. for r in rules_list: # X_explain must be a dataframe of 1 row py_exp_pred = eval_rule(r, X_explain)[0] rule_eval_result.append(py_exp_pred) new_col = {'is_satisfy_instance': rule_eval_result} df = pd.DataFrame(data=new_col) rules = pd.concat([rules, df], axis=1) # select rules that apply to the actual instance to be explained # Note. can't use 'is' as comparing operator because of pandas rules = rules.loc[rules['is_satisfy_instance'] == True] # sort rules according to their importance scores sorted_rules = rules.sort_values(by='importance', ascending=False, kind='mergesort') return sorted_rules
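# Usage sketch (illustrative, not part of the original module): filter_rules keeps the
# positive-coefficient rules that the single row in X_explain actually satisfies. The toy
# frame below only mimics the columns of a RuleFit rules DataFrame ('rule', 'type',
# 'coef', 'importance'); the rules and metric names are assumptions, not model output.
# >>> rules = pd.DataFrame({'rule': ['LOC > 50.0', 'LOC <= 50.0 & nCommit > 2.0'],
# ...                       'type': ['rule', 'rule'],
# ...                       'coef': [0.8, 0.3],
# ...                       'importance': [0.9, 0.4]})
# >>> X_explain = pd.DataFrame({'LOC': [120], 'nCommit': [1]})
# >>> filter_rules(rules, X_explain)['rule'].tolist()
# ['LOC > 50.0']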
[docs]def get_dflt(): """Obtain the default data and model Returns ------- :obj:`dict` A dictionary wrapping all default data and model """ this_dir, _ = os.path.split(__file__) path_rf_model = this_dir + "/default_data/sample_model.pkl" path_X_train = this_dir + "/default_data/X_train.csv" path_y_train = this_dir + "/default_data/y_train.csv" path_X_explain = this_dir + "/default_data/X_explain.csv" path_y_explain = this_dir + "/default_data/y_explain.csv" if INSIDE_VIRTUAL_ENV: cwd = os.getcwd() path_rf_model = cwd + "/tests/default_data/sample_model.pkl" path_X_train = cwd + "/tests/default_data/X_train.csv" path_y_train = cwd + "/tests/default_data/y_train.csv" path_X_explain = cwd + "/tests/default_data/X_explain.csv" path_y_explain = cwd + "/tests/default_data/y_explain.csv" with open(path_rf_model, 'rb') as f: rf_model = pickle.load(f) X_train = pd.read_csv(path_X_train) X_train = X_train.drop(["File"], axis=1) y_train = pd.read_csv(path_y_train)["RealBug"] X_explain = pd.read_csv(path_X_explain) y_explain = pd.read_csv(path_y_explain)["RealBug"] full_ft_names = ['nCommit', 'AddedLOC', 'nCoupledClass', 'LOC', 'CommentToCodeRatio'] return {'X_train': X_train, 'y_train': y_train, 'indep': X_train.columns, 'dep': "RealBug", 'blackbox_model': rf_model, 'X_explain': X_explain, 'y_explain': y_explain, 'full_ft_names': full_ft_names}
[docs]def id_generator(size=15, random_state=check_random_state(None)):
    """Generate unique ids for the div tag which will contain the visualisation stuff from d3.

    Parameters
    ----------
    size : :obj:`int`
        An integer that specifies the length of the returned id, default = 15.
        Size should be in range 1 - 30 (both included).
    random_state : :obj:`np.random.RandomState`, default is None.
        A RandomState instance.

    Returns
    -------
    :obj:`str`
        A random identifier.
    """
    if not isinstance(size, int):
        size = 15
    if size <= 0 or size > 30:
        size = 15
    if not isinstance(random_state, np.random.mtrand.RandomState):
        random_state = check_random_state(None)
    chars = list(string.ascii_uppercase + string.digits)
    return ''.join(random_state.choice(chars, size, replace=True))
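# Usage sketch (illustrative): generating a reproducible identifier for a chart's div tag.
# >>> len(id_generator(size=10, random_state=check_random_state(42)))
# 10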
[docs]def to_js_data(list_of_dict):
    """Transform a python list into a str to be used inside the html <script></script>

    Parameters
    ----------
    list_of_dict : :obj:`list`
        Data to be transformed.

    Returns
    -------
    :obj:`str`
        A str that represents a list of dict, ending with ';'
    """
    if data_validation(list_of_dict):
        return str(list_of_dict) + ";"
    else:
        print("Data to be transformed to the javascript format is not a python list of dict, hence '[{}];' is returned")
        return '[{}];'
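# Usage sketch (illustrative): the returned string is embedded verbatim into the generated
# <script> block by generate_html().
# >>> to_js_data([{'riskScore': ['80%'], 'riskPred': ['Defect']}])
# "[{'riskScore': ['80%'], 'riskPred': ['Defect']}];"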
[docs]class PyExplainer: """A PyExplainer object is able to load training data and an ML model to generate human-centric explanation and visualisation Parameters ---------- X_train : :obj:`pandas.core.frame.DataFrame` Training data X (Features) y_train : :obj:`pandas.core.series.Series` Training data y (Label) indep : :obj:`pandas.core.indexes.base.Index` independent variables (column names) dep : :obj:`str` dependent variables (column name) blackbox_model : :obj:`sklearn.ensemble.RandomForestClassifier` A global random forest model trained from sklearn class_label : :obj:`list` Classification labels, default = ['Clean', 'Defect'] top_k_rules : :obj:`int` Number of top positive and negative rules to be retrieved full_ft_names : :obj:`list` A list containing full feature names inside X_train """ def __init__(self, X_train, y_train, indep, dep, blackbox_model, class_label=['Clean', 'Defect'], top_k_rules=3, full_ft_names=[]): if isinstance(X_train, pd.core.frame.DataFrame): self.X_train = X_train else: print("X_train should be type 'pandas.core.frame.DataFrame'") raise TypeError if isinstance(y_train, pd.core.series.Series): self.y_train = y_train else: print("y_train should be type 'pandas.core.series.Series'") raise TypeError if isinstance(indep, pd.core.indexes.base.Index): self.indep = indep else: print( "indep (feature column names) should be type 'pandas.core.indexes.base.Index'") raise TypeError if isinstance(dep, str): self.dep = dep else: print("dep (label column name) should be type 'str'") raise TypeError all_clf = all_estimators(type_filter="classifier") supported_algo = [clf[1] for clf in all_clf] if type(blackbox_model) in supported_algo: self.blackbox_model = blackbox_model else: print("The blackbox_model should be a classifier provided by sklearn)") raise TypeError if isinstance(class_label, list): if len(class_label) == 2: self.class_label = class_label else: print("class_label should be a list with length of 2") raise ValueError else: print("class_label should be type 'list'") raise TypeError if isinstance(top_k_rules, int): if top_k_rules <= 0 or top_k_rules > 15: print("top_k_rules should be in range 1 - 15 (both included)") raise ValueError else: self.top_k_rules = top_k_rules else: print("top_k_rules should be type 'int'") raise TypeError if full_ft_names: short_ft_names = X_train.columns.to_list() # length of short ft names and full ft names should be the same if len(short_ft_names) != len(full_ft_names): print( "list of short feature names and list of full feature names should have the same length!") raise ValueError self.full_ft_names = dict(zip(short_ft_names, full_ft_names)) else: self.full_ft_names = {} self.bullet_data = [{}] self.risk_data = [{}] self.bullet_output = widgets.Output( layout={'border': '3px solid black'}) self.hbox_items = [] self.X_explain = None self.y_explain = None self.visualisation_title = None self.local_model = None
[docs]    def auto_spearman(self, apply_to_X_train=True, correlation_threshold=0.7, correlation_method='spearman', VIF_threshold=5):
        """An automated feature selection approach that addresses collinearity and multicollinearity.
        For more information, please kindly refer to the `paper <https://ieeexplore.ieee.org/document/8530020>`_.

        Parameters
        ----------
        apply_to_X_train : :obj:`bool`
            Whether to apply the selected columns to the X_train data inside the PyExplainer Obj., default is True
        correlation_threshold : :obj:`float`
            Threshold value of correlation.
        correlation_method : :obj:`str`
            Method for computing the correlation between the features.
        VIF_threshold : :obj:`int`
            Threshold value of VIF score.
        """
        X_AS_train = AutoSpearman(
            self.X_train, correlation_threshold, correlation_method, VIF_threshold)
        if apply_to_X_train:
            self.set_X_train(X_AS_train)
            # if there is data of full feature names
            if self.get_full_ft_names():
                full_ft_names = self.get_full_ft_names()
                new_full_ft_names = {}
                for key in X_AS_train.columns.to_list():
                    new_full_ft_names[key] = full_ft_names[key]
                self.set_full_ft_names(new_full_ft_names)
            return print('X_train data inside PyExplainer was updated based on the selected features above')
        else:
            return X_AS_train
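    # Usage sketch (illustrative, not part of the original module): running AutoSpearman
    # through a PyExplainer object. `py_explainer` is assumed to be constructed as in the
    # explain() docstring below.
    # >>> py_explainer.auto_spearman(apply_to_X_train=True)                  # replaces X_train in place
    # >>> X_selected = py_explainer.auto_spearman(apply_to_X_train=False)    # returns the selection only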
[docs] def explain(self, X_explain, y_explain, top_k=3, max_rules=2000, max_iter=10000, cv=5, search_function='CrossoverInterpolation', random_state=None, reuse_local_model=False): """Generate Rule Object Manually by passing X_explain and y_explain Parameters ---------- X_explain : :obj:`pandas.core.frame.DataFrame` Features to be explained by the local RuleFit model, can be seen as X_test y_explain : :obj:`pandas.core.series.Series` Label to be explained by the local RuleFit model, can be seen as y_test top_k : :obj:`int`, default is 3 Number of top rules to be retrieved max_rules : :obj:`int`, default is 10 Number of maximum rules to be generated max_iter : :obj:`int`, default is 10 Maximum number of iteration to be tuned in to the local RuleFit model cv : :obj:`int`, default is 5 Cross Validation to be tuned in to the local RuleFit model search_function : :obj:`str`, default is 'crossoverinterpolation' Name of the search function to be used to generate the instance used by RuleFit.fit() random_state : :obj:`int`, default is None Random seed for reproducing the same result reuse_local_model : :obj:`bool`, default is False Reproduce the same explanation for the same data Returns ------- :obj:`dict` A dict rule object including all of the data related to the local RuleFit model with the following keys, 'synthetic_data', 'synthetic_predictions', 'X_explain', 'y_explain', 'indep', 'dep', 'top_k_positive_rules', 'top_k_negative_rules'. Examples -------- >>> from pyexplainer.pyexplainer_pyexplainer import PyExplainer >>> import pandas as pd >>> from sklearn.ensemble import RandomForestClassifier >>> data = pd.read_csv('../tests/pyexplainer_test_data/activemq-5.0.0.csv', index_col = 'File') >>> dep = data.columns[-4] >>> indep = data.columns[0:(len(data.columns) - 4)] >>> X_train = data.loc[:, indep] >>> y_train = data.loc[:, dep] >>> blackbox_model = RandomForestClassifier(max_depth=3, random_state=0) >>> blackbox_model.fit(X_train, y_train) >>> class_label = ['Clean', 'Defect'] >>> py_explainer = PyExplainer(X_train, y_train, indep, dep, class_label, blackbox_model) >>> sample_test_data = pd.read_csv('../tests/pyexplainer_test_data/activemq-5.0.0.csv', index_col='File') >>> X_test = sample_test_data.loc[:, indep] >>> y_test = sample_test_data.loc[:, dep] >>> sample_explain_index = 0 >>> X_explain = X_test.iloc[[sample_explain_index]] >>> y_explain = y_test.iloc[[sample_explain_index]] >>> py_explainer.explain(X_explain, y_explain, search_function='crossoverinterpolation', top_k=3, max_rules=30, max_iter=5, cv=5) """ # check if X_explain is a DF if not isinstance(X_explain, pd.core.frame.DataFrame): print("X_explain should be type 'pandas.core.frame.DataFrame'") raise TypeError # check if X_explain has the same num of cols as X_train if len(X_explain.columns) != len(self.X_train.columns): print("X_explain should have the same number of columns as X_train") raise ValueError # check if y_explain is a Series if not isinstance(y_explain, pd.core.series.Series): print("y_explain should be type 'pandas.core.series.Series'") raise TypeError self.set_top_k_rules(top_k) # Step 1 - Generate synthetic instances if search_function.lower() == 'crossoverinterpolation': synthetic_object = self.generate_instance_crossover_interpolation( X_explain, y_explain, random_state=random_state) elif search_function.lower() == 'randomperturbation': # This random perturbation approach to generate instances is used by LIME to gerate synthetic instances synthetic_object = self.generate_instance_random_perturbation( 
X_explain=X_explain) # Step 2 - Generate predictions of synthetic instances using the global model synthetic_instances = synthetic_object['synthetic_data'].loc[:, self.indep] synthetic_predictions = self.blackbox_model.predict( synthetic_instances) if 1 in synthetic_predictions and 0 in synthetic_predictions: one_class_problem = False else: one_class_problem = True if one_class_problem: print("""Random Perturbation only generated one class for the prediction column which means Random Perturbation is not compatible with the current data. The 'Crossover and Interpolation' approach is used as the alternative.""") synthetic_object = self.generate_instance_crossover_interpolation( X_explain, y_explain) synthetic_instances = synthetic_object['synthetic_data'].loc[:, self.indep] synthetic_predictions = self.blackbox_model.predict( synthetic_instances) # Step 3 - Build a RuleFit local model with synthetic instances if reuse_local_model and self.local_model: local_rulefit_model = self.local_model else: local_rulefit_model = RuleFit(rfmode='classify', exp_rand_tree_size=False, random_state=random_state, max_rules=max_rules, cv=cv, max_iter=max_iter, n_jobs=-1) local_rulefit_model.fit(synthetic_instances.values, synthetic_predictions, feature_names=self.indep) self.local_model = local_rulefit_model # Step 4 Get rules from the RuleFit local model rules = local_rulefit_model.get_rules() rules = rules[rules.coef != 0].sort_values("importance", ascending=False, kind='mergesort') rules = rules[rules.type == 'rule'].sort_values("importance", ascending=False, kind='mergesort') positive_filtered_rules = filter_rules(rules, X_explain) # positive rules top_k_positive_rules = positive_filtered_rules.loc[positive_filtered_rules['coef'] > 0] \ .sort_values("importance", ascending=False, kind='mergesort').head(top_k) top_k_positive_rules['Class'] = self.class_label[1] top_k_positive_rules = positive_filtered_rules.reset_index() # filter out nan values top_k_positive_rules = top_k_positive_rules.dropna() # negative rules top_k_negative_rules = rules.loc[rules['coef'] < 0] \ .sort_values("importance", ascending=False, kind='mergesort').head(top_k) top_k_negative_rules['Class'] = self.class_label[0] # filter out nan values top_k_negative_rules = top_k_negative_rules.dropna() rule_obj = {'synthetic_data': synthetic_instances, 'synthetic_predictions': synthetic_predictions, 'X_explain': X_explain, 'y_explain': y_explain, 'indep': self.indep, 'dep': self.dep, 'top_k_positive_rules': top_k_positive_rules, 'top_k_negative_rules': top_k_negative_rules, 'local_rulefit_model': local_rulefit_model} return rule_obj
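    # Usage sketch (illustrative): inspecting the rule object returned by explain().
    # `py_explainer`, `X_explain` and `y_explain` are assumed to be set up as in the
    # docstring example above.
    # >>> rule_obj = py_explainer.explain(X_explain, y_explain, top_k=3, max_rules=30, max_iter=5, cv=5)
    # >>> rule_obj['top_k_positive_rules'][['rule', 'coef', 'importance']].head()
    # >>> rule_obj['top_k_negative_rules'][['rule', 'coef', 'importance']].head()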
[docs] def generate_bullet_data(self, parsed_rule_object): """Generate bullet chart data (a list of dict) to be implemented with d3.js chart. Parameters ---------- parsed_rule_object : :obj:`dict` Top rules parsed from Rule object. Returns ------- :obj:`list` A list of dict that contains the data needed to generate a bullet chart. """ X_explain = self.__get_X_explain() min_max_values = self.retrieve_X_explain_min_max_values() # Version 01 - only visualise for what to follow (Rules => Clean) bullet_data = [] for i in range(len(parsed_rule_object['top_tofollow_rules'])): # sample data of tmp_rule # {'variable': 'MAJOR_COMMIT', 'lessthan': True, 'value': '1.550000011920929'} tmp_rule = parsed_rule_object['top_tofollow_rules'][i] tmp_actual_value = round(X_explain[tmp_rule['variable']][0], 2) separation_point = float(tmp_rule['value']) if tmp_actual_value < 0 or separation_point < 0: print("""actual value of %s < 0, currently do not support this type of rule""" % tmp_rule['variable']) tmp_markers = [tmp_actual_value] # currently, always place marker in the middle of bullet bars plot_min = 0 diff_actual_min = abs(tmp_actual_value - plot_min) plot_max = tmp_actual_value + diff_actual_min if separation_point > plot_max: plot_max += separation_point - plot_max plot_max += diff_actual_min diff_plot_max_min = plot_max - plot_min tmp_subtitle_text = 'Actual = ' + str(tmp_actual_value) tmp_ticks = [plot_min, plot_max] if diff_plot_max_min <= 1: tmp_step = [0.01] elif diff_plot_max_min >= 1000: tmp_step = [10] elif diff_plot_max_min <= 3: tmp_step = [0.1] else: tmp_step = [1] bullet_total_width = 450 tmp_start_points = [0, round((separation_point / diff_plot_max_min if diff_plot_max_min else 0) * bullet_total_width, 4)] tmp_widths = [round(tmp_start_points[1] - plot_min, 4), round(bullet_total_width - tmp_start_points[1], 4)] title_id = '#' + str(i + 1) var_name = str(tmp_rule['variable']) var_ref = var_name # todo - use get # check if there is mapping for full feature names if self.full_ft_names: # todo - use get full_ft_names = self.full_ft_names var_name = full_ft_names[var_name] if tmp_rule['lessthan']: # The rule suggest to decrease the values to less than a certain threshold tmp_title_text = title_id + ' The value of ' + \ var_name + ' is more than ' + \ str(tmp_actual_value) tmp_colors = ["#a6d96a", "#d7191c"] else: # lessthan == FALSE: # The rule suggest to increase the values to more than a certain threshold tmp_title_text = title_id + ' The value of ' + \ var_name + ' is less than ' + \ str(tmp_actual_value) tmp_colors = ["#d7191c", "#a6d96a"] bullet_data.append({ "title": tmp_title_text, "subtitle": tmp_subtitle_text, "ticks": tmp_ticks, "step": tmp_step, "startPoints": tmp_start_points, "widths": tmp_widths, "colors": tmp_colors, "markers": tmp_markers, "varRef": var_ref, }) return bullet_data
[docs] def generate_html(self): """Generate d3 bullet chart html and return it as a String. Returns ---------- :obj:`str` html String """ this_dir, _ = os.path.split(__file__) with open(os.path.join(this_dir, 'css/styles.css'), encoding="utf8") as f: style_css = f.read() with open(os.path.join(this_dir, 'js/d3.min.js'), encoding="utf8") as f: d3_js = f.read() with open(os.path.join(this_dir, 'js/bullet.js'), encoding="utf8") as f: bullet_js = f.read() css_stylesheet = """ <style>%s</style> """ % style_css d3_script = """ <script>%s</script> <script>%s</script> """ % (d3_js, bullet_js) if self.visualisation_title: main_title = self.visualisation_title else: main_title = "Why the model generate such prediction for a given test instance?" title = """ <div style="position: relative; top: 0; width: 100vw; left: 20vw;"> <b>%s</b> </div> """ % main_title unique_id = id_generator() bullet_data = to_js_data(self.__get_bullet_data()) d3_operation_script = """ <script> var margin = { top: 5, right: 40, bottom: 20, left: 500 }, width = 990 - margin.left - margin.right, height = 50 - margin.top - margin.bottom; var chart = d3.bullet().width(width).height(height); var bulletData = %s var svg = d3 .select("#d3-target-bullet-%s") .selectAll("svg") .data(bulletData) .enter() .append("svg") .attr("class", "bullet") .attr("width", width + margin.left + margin.right) .attr("height", height + margin.top + margin.bottom) .append("g") .attr( "transform", "translate(" + margin.left + "," + margin.top + ")" ) .call(chart); var title = svg .append("g") .style("text-anchor", "end") .attr("transform", "translate(-6," + height / 2 + ")"); title .append("text") .attr("class", "title") .text((d) => d.title); title .append("text") .attr("class", "subtitle") .attr("dy", "1em") .text((d) => d.subtitle); </script> """ % (bullet_data, unique_id) html = """ <!DOCTYPE html> <html> <meta http-equiv="content-type" content="text/html; charset=UTF8"> <head> %s %s </head> <body> <div class="bullet-chart"> %s <div class="d3-target-bullet" id="d3-target-bullet-%s" /> </div> %s </body> </html> """ % (css_stylesheet, d3_script, title, unique_id, d3_operation_script) return html
[docs] def generate_instance_crossover_interpolation(self, X_explain, y_explain, random_state=None, debug=False): """An approach to generate instance using Crossover and Interpolation Parameters ---------- X_explain : :obj:`pandas.core.frame.DataFrame` X_explain (Testing Features) y_explain : :obj:`pandas.core.series.Series` y_explain (Testing Label) random_state : :obj:`int` Random Seed debug : :obj:`bool` True for debugging mode, False otherwise. Returns ------- :obj:`dict` A dict with two keys 'synthetic_data' and 'sampled_class_frequency' generated via Crossover and Interpolation. """ # categorical_vars = [] X_train_i = self.X_train.copy() # y_train_i = self.y_train.copy() X_explain = X_explain.copy() y_explain = y_explain.copy() X_train_i.reset_index(inplace=True) X_explain.reset_index(inplace=True) X_train_i = X_train_i.loc[:, self.indep] # y_train_i = y_train_i.reset_index()[[self.dep]] X_explain = X_explain.loc[:, self.indep] y_explain = y_explain.reset_index()[[self.dep]] # get the global model predictions for the training set target_train = self.blackbox_model.predict(X_train_i) # class variables # ori_dataset = pd.concat([X_train_i.reset_index(drop=True), y_train_i], axis=1) # Do feature scaling for continuous data and one hot encoding for categorical data scaler = StandardScaler() trainset_normalize = X_train_i.copy() if debug: print(list(X_train_i), "columns") cases_normalize = X_explain.copy() train_objs_num = len(trainset_normalize) dataset = pd.concat(objs=[trainset_normalize, cases_normalize], axis=0) if debug: print(self.indep, "continuous") print(type(self.indep)) dataset[self.indep] = scaler.fit_transform(dataset[self.indep]) # dataset = pd.get_dummies(dataset, prefix_sep="__", columns=self.__categorical_vars) trainset_normalize = copy.copy(dataset[:train_objs_num]) cases_normalize = copy.copy(dataset[train_objs_num:]) # make dataframe to store similarities of the trained instances from the explained instance dist_df = pd.DataFrame(index=trainset_normalize.index.copy()) width = math.sqrt(len(X_train_i.columns)) * 0.75 # similarity for count, case in cases_normalize.iterrows(): # Calculate the euclidean distance from the instance to be explained dist = np.linalg.norm( trainset_normalize.sub(np.array(case)), axis=1) # Convert distance to a similarity score similarity = np.exp(-(dist ** 2) / (2 * (width ** 2))) dist_df['dist'] = similarity dist_df['t_target'] = target_train # get the unique classes of the training set unique_classes = dist_df.t_target.unique() # Sort similarity scores in to descending order dist_df.sort_values(by=['dist'], ascending=False, inplace=True, kind='mergesort') # dist_df.reset_index(inplace=True) # Make a dataframe with top 40 elements in each class top_fourty_df = pd.DataFrame([]) for clz in unique_classes: top_fourty_df = top_fourty_df.append( dist_df[dist_df['t_target'] == clz].head(40)) # top_fourty_df.reset_index(inplace=True) # get the minimum value of the top 40 elements and return the index cutoff_similarity = top_fourty_df.nsmallest( 1, 'dist', keep='last').index.values.astype(int)[0] # Get the location for the given index with the minimum similarity min_loc = dist_df.index.get_loc(cutoff_similarity) # whole neighbourhood without undersampling the majority class train_neigh_sampling_b = dist_df.iloc[0:min_loc + 1] # get the size of neighbourhood for each class target_details = train_neigh_sampling_b.groupby( ['t_target']).size() if debug: print(target_details, "target_details") target_details_df = pd.DataFrame( {'target': 
target_details.index, 'target_count': target_details.values}) # Get the majority class and undersample final_neighbours_similarity_df = pd.DataFrame([]) for index, row in target_details_df.iterrows(): if row["target_count"] > 200: filterd_class_set = train_neigh_sampling_b \ .loc[train_neigh_sampling_b['t_target'] == row['target']] \ .sample(n=200, random_state=random_state) final_neighbours_similarity_df = final_neighbours_similarity_df.append( filterd_class_set) else: filterd_class_set = train_neigh_sampling_b \ .loc[train_neigh_sampling_b['t_target'] == row['target']] final_neighbours_similarity_df = final_neighbours_similarity_df.append( filterd_class_set) if debug: print(final_neighbours_similarity_df, "final_neighbours_similarity_df") # Get the original training set instances which is equal to the index of the selected neighbours train_set_neigh = X_train_i[X_train_i.index.isin( final_neighbours_similarity_df.index)] if debug: print(train_set_neigh, "train set neigh") train_class_neigh = y_explain[y_explain.index.isin( final_neighbours_similarity_df.index)] # train_neigh_df = train_set_neigh.join(train_class_neigh) # class_neigh = train_class_neigh.groupby([self.dep]).size() new_con_df = pd.DataFrame([]) sample_classes_arr = [] sample_indexes_list = [] # Generating instances using the cross-over technique for num in range(0, 1000): rand_rows = train_set_neigh.sample(2, random_state=random_state) sample_indexes_list = sample_indexes_list + rand_rows.index.values.tolist() # similarity_both = dist_df[dist_df.index.isin(rand_rows.index)] sample_classes = train_class_neigh[train_class_neigh.index.isin( rand_rows.index)] sample_classes = np.array( sample_classes.to_records().view(type=np.matrix)) sample_classes_arr.append(sample_classes[0].tolist()) alpha_n = np.random.uniform(low=0, high=1.0) x = rand_rows.iloc[0] y = rand_rows.iloc[1] new_ins = x + (y - x) * alpha_n new_ins = new_ins.to_frame().T """ # For Categorical Variables for cat in categorical_vars: x_df = x.to_frame().T y_df = y.to_frame().T # Check similarity of x > similarity of y if similarity_both.iloc[0]['dist'] > similarity_both.iloc[1]['dist']: new_ins[cat] = x_df.iloc[0][cat] # Check similarity of y > similarity of x elif similarity_both.iloc[0]['dist'] < similarity_both.iloc[1]['dist']: new_ins[cat] = y_df.iloc[0][cat] else: new_ins[cat] = random.choice([x_df.iloc[0][cat], y_df.iloc[0][cat]]) """ new_ins.name = num new_con_df = new_con_df.append(new_ins, ignore_index=True) # Generating instances using the mutation technique for num in range(1000, 2000): rand_rows = train_set_neigh.sample(3, random_state=random_state) sample_indexes_list = sample_indexes_list + rand_rows.index.values.tolist() sample_classes = train_class_neigh[train_class_neigh.index.isin( rand_rows.index)] sample_classes = np.array( sample_classes.to_records().view(type=np.matrix)) sample_classes_arr.append(sample_classes[0].tolist()) mu_f = np.random.uniform(low=0.5, high=1.0) x = rand_rows.iloc[0] y = rand_rows.iloc[1] z = rand_rows.iloc[2] new_ins = x + (y - z) * mu_f new_ins = new_ins.to_frame().T """ # For Categorical Variables get the value of the closest instance to the explained instance for cat in categorical_vars: x_df = x.to_frame().T y_df = y.to_frame().T z_df = z.to_frame().T new_ins[cat] = random.choice([x_df.iloc[0][cat], y_df.iloc[0][cat], z_df.iloc[0][cat]]) """ new_ins.name = num new_con_df = new_con_df.append(new_ins, ignore_index=True) # get the global model predictions of the generated instances and the instances in the neighbourhood 
predict_dataset = train_set_neigh.append( new_con_df, ignore_index=True) target = self.blackbox_model.predict(predict_dataset) target_df = pd.DataFrame(target) # neighbor_frequency = Counter(tuple(sorted(entry)) for entry in sample_classes_arr) new_df_case = pd.concat([predict_dataset, target_df], axis=1) new_df_case = np.round(new_df_case, 2) new_df_case.rename(columns={0: y_explain.columns[0]}, inplace=True) sampled_class_frequency = new_df_case.groupby([self.dep]).size() return {'synthetic_data': new_df_case, 'sampled_class_frequency': sampled_class_frequency}
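    # Usage sketch (illustrative): generating synthetic neighbours around one test instance.
    # >>> synthetic = py_explainer.generate_instance_crossover_interpolation(X_explain, y_explain, random_state=0)
    # >>> synthetic['synthetic_data'].shape          # 2,000 generated rows plus the selected neighbours
    # >>> synthetic['sampled_class_frequency']       # class balance of the generated instances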
[docs] def generate_instance_random_perturbation(self, X_explain, debug=False): """The random perturbation approach to generate synthetic instances which is also used by LIME. Parameters ---------- X_explain : :obj:`pandas.core.frame.DataFrame` X_explain (Testing Features) debug : :obj:`bool` True for debugging mode, False otherwise. Returns ------- :obj:`dict` A dict with two keys 'synthetic_data' and 'sampled_class_frequency' generated via Random Perturbation. """ random_seed = 0 data_row = X_explain.loc[:, self.indep].values num_samples = 1000 sampling_method = 'gaussian' discretizer = None sample_around_instance = True scaler = sklearn.preprocessing.StandardScaler(with_mean=False) scaler.fit(self.X_train.loc[:, self.indep]) # distance_metric = 'euclidean' random_state = check_random_state(random_seed) is_sparse = sp.sparse.issparse(data_row) if is_sparse: num_cols = data_row.shape[1] data = sp.sparse.csr_matrix( (num_samples, num_cols), dtype=data_row.dtype) else: num_cols = data_row.shape[0] data = np.zeros((num_samples, num_cols)) if discretizer is None: instance_sample = data_row scale = scaler.scale_ # mean = scaler.mean_ if is_sparse: # Perturb only the non-zero values non_zero_indexes = data_row.nonzero()[1] num_cols = len(non_zero_indexes) instance_sample = data_row[:, non_zero_indexes] scale = scale[non_zero_indexes] # mean = mean[non_zero_indexes] if sampling_method == 'gaussian': data = random_state.normal( 0, 1, num_samples * num_cols).reshape(num_samples, num_cols) data = np.array(data) else: warnings.warn('''Invalid input for sampling_method. Defaulting to Gaussian sampling.''', UserWarning) data = random_state.normal( 0, 1, num_samples * num_cols).reshape(num_samples, num_cols) data = np.array(data) if sample_around_instance: data = data * scale + instance_sample # else: # data = data * scale + mean if is_sparse: if num_cols == 0: data = sp.sparse.csr_matrix( (num_samples, data_row.shape[1]), dtype=data_row.dtype) else: indexes = np.tile(non_zero_indexes, num_samples) indptr = np.array( range(0, len(non_zero_indexes) * (num_samples + 1), len(non_zero_indexes))) data_1d_shape = data.shape[0] * data.shape[1] data_1d = data.reshape(data_1d_shape) data = sp.sparse.csr_matrix( (data_1d, indexes, indptr), shape=(num_samples, data_row.shape[1])) # first_row = data_row # else: # first_row = discretizer.discretize(data_row) data[0] = data_row.copy() inverse = data.copy() # todo - this for-loop is for categorical columns in the future """ for column in categorical_features: values = feature_values[column] freqs = feature_frequencies[column] inverse_column = random_state.choice(values, size=num_samples, replace=True, p=freqs) binary_column = (inverse_column == first_row[column]).astype(int) binary_column[0] = 1 inverse_column[0] = data[0, column] data[:, column] = binary_column inverse[:, column] = inverse_column """ # if discretizer is not None: # inverse[1:] = discretizer.undiscretize(inverse[1:]) inverse[0] = data_row if sp.sparse.issparse(data): # Note in sparse case we don't subtract mean since data would become dense scaled_data = data.multiply(scaler.scale_) # Multiplying with csr matrix can return a coo sparse matrix if not sp.sparse.isspmatrix_csr(scaled_data): scaled_data = scaled_data.tocsr() else: scaled_data = (data - scaler.mean_) / scaler.scale_ # distances = sklearn.metrics.pairwise_distances(scaled_data, # scaled_data[0].reshape(1, -1), # metric=distance_metric).ravel() new_df_case = pd.DataFrame(data=scaled_data, columns=self.indep) sampled_class_frequency = 0 
n_defect_class = np.sum(self.blackbox_model.predict( new_df_case.loc[:, self.indep])) if debug: print('Random seed', random_seed, 'nDefective', n_defect_class) return {'synthetic_data': new_df_case, 'sampled_class_frequency': sampled_class_frequency}
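    # Usage sketch (illustrative): the LIME-style random perturbation alternative.
    # >>> perturbed = py_explainer.generate_instance_random_perturbation(X_explain)
    # >>> perturbed['synthetic_data'].shape   # (1000, number of features)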
[docs]    def generate_risk_data(self, X_explain):
        """Generate risk prediction and risk score to be visualised

        Parameters
        ----------
        X_explain : :obj:`pandas.core.frame.DataFrame`
            The instance (features) to be explained.

        Returns
        -------
        :obj:`list`
            A list of dict that contains the data of risk prediction and risk score.
        """
        risk_pred = int(self.blackbox_model.predict(X_explain)[0])
        return [{"riskScore": [str(int(round(self.blackbox_model.predict_proba(X_explain)[0][1] * 100, 0))) + '%'],
                 "riskPred": [self.class_label[risk_pred]]
                 }]
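    # Usage sketch (illustrative): the returned structure feeds the risk-score progress bar
    # and the d3 visualisation; the numbers depend entirely on the global model.
    # >>> py_explainer.generate_risk_data(X_explain)
    # [{'riskScore': ['65%'], 'riskPred': ['Defect']}]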
[docs] def get_full_ft_names(self): """getter of self.full_ft_names Returns ---------- :obj:`list` A list of full feature names in X_train following the same order as X_train """ return self.full_ft_names
[docs] def get_risk_pred(self): """Retrieve the risk prediction from risk_data Returns ---------- :obj:`str` A string of risk prediction """ return self.__get_risk_data()[0]['riskPred'][0]
[docs] def get_risk_score(self): """Retrieve the risk score from risk_data Returns ---------- :obj:`float` A float of risk score """ risk_score = self.__get_risk_data()[0]['riskScore'][0].strip("%") return float(risk_score)
[docs] def get_top_k_rules(self): """Getter of top_k_rules Returns ---------- :obj:`int` Number of top positive and negative rules to be retrieved """ return self.top_k_rules
[docs] def generate_progress_bar_items(self): """Generate items to be set into hbox (horizontal box) """ progress_bar = widgets.FloatProgress(value=0, min=0, max=100, bar_style='info', layout=widgets.Layout( width='40%'), orientation='horizontal') left_text = widgets.Label("Risk Score: ") right_text = widgets.Label("0") self.__set_hbox_items( [left_text, progress_bar, right_text, widgets.Label("%")])
[docs] def generate_sliders(self): """Generate one or more slider widgets and return as a list. Slider would be either IntSlider or FloatSlider depending on the value in the data Returns ------- :obj:`list` A list of slider widgets. """ slider_widgets = [] data = self.__get_bullet_data() style = {'description_width': '40%'} layout = widgets.Layout(width='99%', height='20px') for d in data: # decide to use either IntSlider or FloatSlider if isinstance(d['step'], int): # create IntSlider obj and store it into a list slider = widgets.IntSlider( value=d['markers'][0], min=d['ticks'][0], max=d['ticks'][-1], step=d['step'][0], description=d['title'], layout=layout, style=style, disabled=False, continuous_update=False, orientation='horizontal', readout=True, readout_format='d' ) slider_widgets.append(slider) else: # create FloatSlider obj and store it into a list slider = widgets.FloatSlider( value=d['markers'][0], min=d['ticks'][0], max=d['ticks'][-1], step=d['step'][0], description=d['title'], layout=layout, style=style, disabled=False, continuous_update=False, orientation='horizontal', readout=True, readout_format='.1f' ) slider_widgets.append(slider) return slider_widgets
[docs] def on_value_change(self, change, debug=False): """The callback function for the interactive slider Whenever the user interacts with the slider, If the slider is in the non-continuous update mode, only if the mouse click is released, this callback will be triggered. If the slider is in the continuous update mode (not recommended here), this function will be triggered continuously when the user is moving the slider. This callback will first clear the output of Risk Score Progress Bar and the Bullet Chart. Then it will call funcs to compute the new values to be visualised. When the computing is done, it will soon visualise the new value. Parameters ---------- change : :obj:`dict` A dict that contains the former(before changing) and later(after changing) data inside the slider """ # step 1 - clear the bullet chart output and risk score bar output bullet_out = self.bullet_output bullet_out.clear_output() # step 2 - compute new values to be visualised # get var changed bullet_data = self.__get_bullet_data() id = int(change['owner'].description.split(" ")[0].strip("#")) var_changed = bullet_data[id - 1]['varRef'] if debug: new_value = change['new'] else: new_value = change.new # modify changed var in X_explain X_explain = self.__get_X_explain() row_name = self.__get_X_explain().index[0] X_explain.at[row_name, var_changed] = new_value # modify bullet data bullet_data[id - 1]['markers'][0] = new_value self.__set_bullet_data(bullet_data) # generate new risk data self.__set_risk_data(self.generate_risk_data(X_explain)) # step 3 - visualise new output # update risk score progress bar self.run_bar_animation() # update bullet chart with bullet_out: # display d3 bullet chart html = self.generate_html() display(HTML(html))
[docs] def parse_top_rules(self, top_k_positive_rules, top_k_negative_rules): """Parse top k positive rules and top k negative rules given positive and negative rules as DataFrame Parameters ---------- top_k_positive_rules : :obj:`pandas.core.frame.DataFrame` Top positive rules DataFrame top_k_negative_rules : :obj:`pandas.core.frame.DataFrame` Top negative rules DataFrame Returns ------- :obj:`dict` A dict containing two keys, 'top_tofollow_rules' and 'top_toavoid_rules' """ smaller_top_rule = min( [len(top_k_positive_rules), len(top_k_negative_rules)]) if self.get_top_k_rules() > smaller_top_rule: self.set_top_k_rules(smaller_top_rule) top_variables = [] top_k_toavoid_rules = [] top_k_tofollow_rules = [] for i in range(len(top_k_positive_rules)): tmp_rule = (top_k_positive_rules['rule'].iloc[i]) tmp_rule = tmp_rule.strip() tmp_rule = str.split(tmp_rule, '&') for j in tmp_rule: j = j.strip() tmp_sub_rule = str.split(j, ' ') tmp_variable = tmp_sub_rule[0] tmp_condition_variable = tmp_sub_rule[1] tmp_value = tmp_sub_rule[2] if tmp_variable not in top_variables: top_variables.append(tmp_variable) top_k_toavoid_rules.append({'variable': tmp_variable, 'lessthan': tmp_condition_variable[0] == '<', 'value': tmp_value}) if len(top_k_toavoid_rules) == self.get_top_k_rules(): break if len(top_k_toavoid_rules) == self.get_top_k_rules(): break for i in range(len(top_k_negative_rules)): tmp_rule = (top_k_negative_rules['rule'].iloc[i]) tmp_rule = tmp_rule.strip() tmp_rule = str.split(tmp_rule, '&') for j in tmp_rule: j = j.strip() tmp_sub_rule = str.split(j, ' ') tmp_variable = tmp_sub_rule[0] tmp_condition_variable = tmp_sub_rule[1] tmp_value = tmp_sub_rule[2] if tmp_variable not in top_variables: top_variables.append(tmp_variable) top_k_tofollow_rules.append({'variable': tmp_variable, 'lessthan': tmp_condition_variable[0] == '<', 'value': tmp_value}) if len(top_k_tofollow_rules) == self.get_top_k_rules(): break if len(top_k_tofollow_rules) == self.get_top_k_rules(): break if top_k_tofollow_rules == []: print("PyExplainer can not find rules to follow!") print("This could lead to blank explanation UI!") print("Please check whether the global model is properly trained with sufficient training data.") if top_k_toavoid_rules == []: print("PyExplainer can not find rules to avoid!") print("This could lead to blank explanation UI!") print("Please check whether the global model is properly trained with sufficient training data.") return {'top_tofollow_rules': top_k_tofollow_rules, 'top_toavoid_rules': top_k_toavoid_rules}
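    # Usage sketch (illustrative): parse_top_rules splits rule strings such as
    # 'LOC > 50.0 & nCommit <= 2.0' into per-variable conditions. The values shown are
    # assumptions about the shape of each entry, not real model output.
    # >>> parsed = py_explainer.parse_top_rules(rule_obj['top_k_positive_rules'], rule_obj['top_k_negative_rules'])
    # >>> parsed['top_toavoid_rules'][0]
    # {'variable': 'LOC', 'lessthan': False, 'value': '50.0'}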
[docs] def retrieve_X_explain_min_max_values(self): """Retrieve the minimum and maximum value from X_train Returns ------- :obj:`dict` A dict containing two keys, 'min_values' and 'max_values' """ min_values = self.X_train.min() max_values = self.X_train.max() return {'min_values': min_values, 'max_values': max_values}
[docs] def run_bar_animation(self): """Run the animation of Risk Score Progress Bar """ import time items_in_hbox = self.__get_hbox_items() progress_bar = items_in_hbox[1] risk_score = self.get_risk_score() risk_prediction = True if self.get_risk_pred().upper() == self.class_label[0].upper(): risk_prediction = False if risk_prediction: progress_bar.style = {'bar_color': '#FA8128'} else: progress_bar.style = {'bar_color': '#00FF00'} # play speed of the animation play_speed = 1 # progress bar animation # count start from the current val of the progress bar progress_bar.value = 0 count = progress_bar.value right_text = items_in_hbox[2] while count < risk_score: progress_bar.value += play_speed # signal to increment the progress bar new_progress_value = float(right_text.value) + play_speed if new_progress_value > risk_score: right_text.value = str(risk_score) else: right_text.value = str(new_progress_value) time.sleep(.01) count += play_speed # update the right text self.update_right_text(right_text)
[docs] def set_full_ft_names(self, full_ft_names): """Setter of full_ft_names Parameters ---------- full_ft_names : :obj:`list` A list of full feature names in X_train following the same order as X_train """ self.full_ft_names = full_ft_names
[docs] def set_top_k_rules(self, top_k_rules): """Setter of top_k_rules Parameters ---------- top_k_rules : :obj:`int` Number of top positive and negative rules to be retrieved """ if top_k_rules <= 0 or top_k_rules > 15 or isinstance(top_k_rules, int) == False: return print("set top_k_rules failed, top_k_rules should be int in range 1 - 15 (both included)") else: self.top_k_rules = top_k_rules
[docs] def set_X_train(self, X_train): """Setter of X_train Parameters ---------- X_train : :obj:`pandas.core.frame.DataFrame` X_train data """ if isinstance(X_train, pd.core.frame.DataFrame): self.X_train = X_train else: return print("set X_train failed, X_train should be type of pandas.core.frame.DataFrame!")
[docs] def show_visualisation(self, title): """Display items as follows, (1) Risk Score Progress Bar (made from ipywidgets) (2) Interactive Slider (made from ipywidgets) (3) Bullet Chart (Generated By D3.js) """ # set title self.visualisation_title = title # display risk score progress bar self.generate_progress_bar_items() items = self.__get_hbox_items() display(widgets.HBox(items)) self.run_bar_animation() bullet_out = self.bullet_output bullet_out.clear_output() display(bullet_out) with bullet_out: # display d3 bullet chart html = self.generate_html() display(HTML(html)) # display sliders sliders = self.generate_sliders() for slider in sliders: slider.observe(self.on_value_change, names='value') display(slider)
[docs] def update_risk_score(self, risk_score): """Update the risk score value inside the risk_data Parameters ---------- risk_score : :obj:`int` Value of risk score """ risk_score = str(risk_score) + '%' self.__get_risk_data()[0]['riskScore'][0] = risk_score
[docs] def update_right_text(self, right_text): """Update the text on the rightward side of the Risk Score Progress Bar Parameters ---------- right_text : :obj:`widgets.Label` Text on the rightward side of the Risk Score Progress Bar """ if isinstance(right_text, widgets.Label): self.__get_hbox_items()[2] = right_text else: print( "The right_text to be set into hbox_items should be type 'ipywidgets.Label'") raise TypeError
[docs] def visualise(self, rule_obj, title=None): """Given the rule object, show all of the visualisation as follows . (1) Risk Score Progress Bar (made from ipywidgets) (2) Interactive Slider (made from ipywidgets) (3) Bullet Chart (Generated By D3.js) Parameters ---------- rule_obj : :obj:`dict` A rule dict generated either through loading the .pyobject file or the .explain(...) function Examples -------- >>> from pyexplainer.pyexplainer_pyexplainer import PyExplainer >>> import pandas as pd >>> from sklearn.ensemble import RandomForestClassifier >>> data = pd.read_csv('../tests/pyexplainer_test_data/activemq-5.0.0.csv', index_col = 'File') >>> dep = data.columns[-4] >>> indep = data.columns[0:(len(data.columns) - 4)] >>> X_train = data.loc[:, indep] >>> y_train = data.loc[:, dep] >>> blackbox_model = RandomForestClassifier(max_depth=3, random_state=0) >>> blackbox_model.fit(X_train, y_train) >>> class_label = ['Clean', 'Defect'] >>> pyExp = PyExplainer(X_train, y_train, indep, dep, class_label, blackbox_model) >>> sample_test_data = pd.read_csv('../tests/pyexplainer_test_data/activemq-5.0.0.csv', index_col = 'File') >>> X_test = sample_test_data.loc[:, indep] >>> y_test = sample_test_data.loc[:, dep] >>> sample_explain_index = 0 >>> X_explain = X_test.iloc[[sample_explain_index]] >>> y_explain = y_test.iloc[[sample_explain_index]] >>> rule_obj = pyExp.explain(X_explain, y_explain, search_function = 'CrossoverInterpolation', top_k = 3, max_rules=30, max_iter =5, cv=5, debug = False) >>> pyExp.visualise(rule_obj) """ self.visualisation_data_setup(rule_obj) self.show_visualisation(title)
[docs] def visualisation_data_setup(self, rule_obj): """Set up the data before visualising them Parameters ---------- rule_obj : :obj:`dict` A rule dict generated either through loading the .pyobject file or the .explain(...) function """ top_rules = self.parse_top_rules(top_k_positive_rules=rule_obj['top_k_positive_rules'], top_k_negative_rules=rule_obj['top_k_negative_rules']) self.__set_X_explain(rule_obj['X_explain']) self.__set_y_explain(rule_obj['y_explain']) self.__set_bullet_data(self.generate_bullet_data(top_rules)) self.__set_risk_data(self.generate_risk_data(self.__get_X_explain()))
def __get_bullet_data(self): """Getter of bullet_data Returns ---------- :obj:`list` A list of dict that contains data needed by the d3 bullet chart """ return self.bullet_data def __get_bullet_output(self): """Getter of bullet_output Returns ---------- :obj:`ipywidgets.Output` A Output object used to wrap and locate contents of visualisation """ return self.bullet_output def __get_hbox_items(self): """Getter of hbox_items Returns ---------- :obj:`list` A list of dict that contains items to be in a horizontal box """ return self.hbox_items def __get_risk_data(self): """Getter of risk_data Returns ---------- :obj:`list` A list of dict that contains data needed by the d3 bullet chart """ return self.risk_data def __get_X_explain(self): """Getter of X_explain Returns ---------- :obj:`pandas.core.frame.DataFrame` An explained DataFrame containing the features """ return self.X_explain def __get_y_explain(self): """Getter of y_explain Returns ---------- :obj:`pandas.core.series.Series` An explained DataFrame containing the label """ return self.y_explain def __set_bullet_data(self, bullet_data): """Setter of bullet_data Parameters ---------- bullet_data : :obj:`list` A list of dict that contains data needed by the d3 bullet chart """ if data_validation(bullet_data): self.bullet_data = bullet_data else: print('bullet_data is not in the format of python list of dict') raise ValueError def __set_bullet_output(self, bullet_output): """Setter of bullet_output Parameters ---------- bullet_output : :obj:`widgets.Output` A Output object used to wrap and locate contents of visualisation """ if isinstance(bullet_output, widgets.Output): self.bullet_output = bullet_output else: print("bullet_output should be type 'ipywidgets.Output'") raise TypeError def __set_hbox_items(self, hbox_items): """Setter of hbox_items Parameters ---------- hbox_items : :obj:`list` A list of dict that contains items to be in a horizontal box """ if len(hbox_items) == 4: if isinstance(hbox_items[0], widgets.Label) and isinstance(hbox_items[1], widgets.FloatProgress) \ and isinstance(hbox_items[2], widgets.Label) and isinstance(hbox_items[3], widgets.Label): self.hbox_items = hbox_items else: print("""hbox_items should be in the format of '[widgets.Label, widgets.FloatProgress, widgets.Label, widgets.Label]'""") raise TypeError else: print("""hbox_items should be in the format of '[widgets.Label, widgets.FloatProgress, widgets.Label, widgets.Label]'""") raise TypeError def __set_risk_data(self, risk_data): """Setter of risk_data Parameters ---------- risk_data : :obj:`list` A list of dict that contains risk prediction and risk score info """ if data_validation(risk_data): self.risk_data = risk_data else: print('risk_data is not in the format of python list of dict') raise ValueError def __set_X_explain(self, X_explain): """Setter of X_explain Parameters ---------- X_explain : :obj:`pandas.core.frame.DataFrame` An explained DataFrame containing feature cols """ if isinstance(X_explain, pd.core.frame.DataFrame): self.X_explain = X_explain else: print("X_explain should be type 'pandas.core.frame.DataFrame'") raise TypeError def __set_y_explain(self, y_explain): """Setter of y_explain Parameters ---------- y_explain : :obj:`pandas.core.series.Series` An explained DataFrame containing label col """ if isinstance(y_explain, pd.core.series.Series): self.y_explain = y_explain else: print("y_explain should be type 'pandas.core.series.Series'") raise TypeError