Source code for mlpath.mlquest.mlquest

'''
This is the main module of mlquest. It contains the mlquest class which is used to log machine learning experiments.
'''
import mlpath.mlquest.utils as utils
from mlpath.mldir_cli.web.app import run_server

# pylint: skip-file
import time
import warnings
import inspect
from varname import argname
from copy import copy
import pickle
import os
import json
from IPython.display import display, HTML
import shutil
from collections import OrderedDict

class mlquest():
    '''
    The mlquest class provides methods and attributes to log machine learning experiments.
    '''
    quests = OrderedDict({})            # dictionary of quests (e.g., one for each model) that contains a list of logs (runs)
    log = OrderedDict({})               # dictionary of the current log (run)
    active = False                      # whether a quest is already active
    start_time = None                   # to compute the duration of the experiment later
    relative_path = ''                  # the relative location where the 'Quests' folder is saved
    curr_dir = None                     # the name of the folder containing the current file (for saving purposes)
    non_default_log = OrderedDict({})   # contains the arguments actually passed to the function
    log_defs = False                    # if true, default arguments are also logged
    quest_name = None                   # the name of the quest (e.g., the name of the model in the current file)

    @staticmethod
    def get_quests_folder():
        '''
        :meta private:
        '''
        return f'{mlquest.relative_path}/Quests/{mlquest.curr_dir}'

    @staticmethod
    def get_quest_folder():
        '''
        :meta private:
        '''
        return f'{mlquest.relative_path}/Quests/{mlquest.curr_dir}/{mlquest.quest_name}'

    @staticmethod
    def get_quest_json_folder():
        '''
        :meta private:
        '''
        return f'{mlquest.get_quest_folder()}/json'

    @staticmethod
    def get_quest_json_file():
        '''
        :meta private:
        '''
        return f'{mlquest.get_quest_folder()}/json/{mlquest.quest_name}.json'

    @staticmethod
    def get_quest_json_config_file():
        '''
        :meta private:
        '''
        return f'{mlquest.get_quest_folder()}/json/{mlquest.quest_name}-config.json'
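Taken together, the path helpers above resolve everything relative to the installed module and the current working directory. A brief sketch of the strings they produce, using hypothetical values set directly only for illustration (start_quest normally fills these in):

from mlpath.mlquest.mlquest import mlquest

# Hypothetical values, for illustration only.
mlquest.relative_path = '/site-packages/mlpath/mlquest'
mlquest.curr_dir = 'my-project'
mlquest.quest_name = 'Naive-Bayes'

print(mlquest.get_quest_folder())
# /site-packages/mlpath/mlquest/Quests/my-project/Naive-Bayes
print(mlquest.get_quest_json_config_file())
# /site-packages/mlpath/mlquest/Quests/my-project/Naive-Bayes/json/Naive-Bayes-config.json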
    @staticmethod
    def start_quest(quest_name, **kwargs):
        '''
        Start a new run under the quest with the name quest_name. This function should be called
        before any other function with logging functionality.

        :param quest_name: The name of the experiment this run belongs to (e.g., the name of the model in the current file)
        :type quest_name: string

        :Example:

        The following would start a new quest called 'Naive-Bayes'

        >>> start_quest('Naive-Bayes')
        '''
        # 1. get the quest folder or make it if it doesn't exist
        mlquest.relative_path = os.path.dirname(os.path.abspath(__file__))
        mlquest.curr_dir = os.path.basename(os.getcwd())
        mlquest.quest_name = quest_name
        quest_folder = mlquest.get_quest_folder()
        if not os.path.exists(quest_folder):
            os.makedirs(quest_folder, exist_ok=True)

        # 2. load the quests dictionary from the file if it exists
        if 'quests.mlq' in os.listdir(quest_folder):
            with open(quest_folder + '/quests.mlq', 'rb') as f:
                mlquest.quests = pickle.load(f)

        # 3. Initiate the attributes of the new quest to be added to quests later
        if mlquest.active == True:
            warnings.warn("Attempting to start a run while another one is active may cause data overwrite")
        else:
            mlquest.active = True
            mlquest.log['info'] = {}
            mlquest.log['info']['name'] = quest_name
            mlquest.start_time = time.time()
            mlquest.log['info']['time'] = time.strftime('%X')
            mlquest.log['info']['date'] = time.strftime('%x')
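A minimal usage sketch, assuming the class is imported directly from this module (the docstrings alias it as mlq); 'Naive-Bayes' is the example quest name from the docstring above:

from mlpath.mlquest.mlquest import mlquest as mlq

# Creates Quests/<cwd-name>/Naive-Bayes/ if needed, loads any existing quests.mlq,
# and records the run's name, start time, and date under the 'info' column group.
mlq.start_quest('Naive-Bayes')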
    @staticmethod
    def clear():
        '''
        Clear the log record of the current run. You may use it while handling exceptions or debugging.
        '''
        if mlquest.active == False:
            warnings.warn("Attempting to clear the current run when no run is active will do nothing")
        mlquest.log = {}
    @staticmethod
    def l(func, name=None):
        '''
        Log the scalar parameters of a function. This function should be called on any function
        whose parameters you want to log.

        :param func: The function to be logged
        :type func: function
        :param name: A custom name for the function to be logged. If not given, the name of the function will be used.
        :type name: string
        :return: The function wrapped with the logging functionality

        :Example:

        The following would log the parameters of the function NaiveBayesFit in the current run log

        >>> accuracy = mlq.l(NaiveBayesFit)(alpha=1024, beta_param=7, c=12)

        :Notes:

        - It doesn't matter whether an argument is given through a variable or as a literal value,
          nor whether it is given as a named argument or not; :func:`mlq.l()` will log the values
          under the column corresponding to the name in the function's signature.
        - :func:`mlq.l()` always tracks all scalar arguments given to a function, resolving their
          names through the function's signature.
        - If you later log a new function, :samp:`MLQuest` handles this by creating new columns
          that are empty for the previous runs (rows).
        - Likewise, deleting a function will make the corresponding columns empty for future runs (rows).
        - :func:`mlq.l()` doesn't log collections, to avoid having to deal with very large arrays.
          If your hyperparameter is a small array, you can still stringify it and log it using the
          :func:`mlq.to_log_ext()` method.
        '''
        if mlquest.active == False:
            warnings.warn("Attempting to log a function when no run is active will do nothing")
            return func

        # wrap the function in a more generic version with logging functionality
        def wrapped(*args, **kwargs):
            signature = inspect.signature(func)
            # Get the parameters of the function
            params = signature.parameters.values()
            # the default values of the parameters (unless overridden by a keyword argument)
            defaults = {param.name: param.default
                        for param in params
                        if param.default != inspect._empty and kwargs.get(param.name) is None}
            # will have all the set values of the parameters
            values = {}
            for i, param in enumerate(params):
                # positional arguments not given as keyword arguments must be here
                if i < len(args):
                    data = utils.stringify(args[i])
                    if data is not None:
                        values[param.name] = data
            # the rest of the parameters are given by name, by default, or as keyword arguments in **kwargs
            for key, value in kwargs.items():
                data = utils.stringify(value)
                if data is not None:
                    values[key] = data
            non_def_values = copy(values)
            for key, value in defaults.items():
                data = utils.stringify(value)
                if data is not None:
                    values[key] = data
            # Now set the values in the log with the key being the name of the function
            if name:
                mlquest.log[name] = values
                mlquest.non_default_log[name] = non_def_values
            else:
                mlquest.log[func.__name__] = values
                mlquest.non_default_log[func.__name__] = non_def_values
            return func(*args, **kwargs)

        return wrapped
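A sketch of the wrapper in action. train_nb here is a hypothetical stand-in for a real training routine such as the NaiveBayesFit of the example above; only arguments that stringify to scalars end up in the log:

from mlpath.mlquest.mlquest import mlquest as mlq

def train_nb(alpha, beta_param=1.0, c=None, weights=None):
    '''Hypothetical stand-in for a real training routine.'''
    return 0.93   # pretend this is an accuracy

mlq.start_quest('Naive-Bayes')
acc = mlq.l(train_nb)(alpha=1024, beta_param=7, c=12)
# The current run now has a 'train_nb' column group with 'alpha', 'beta_param', and 'c';
# a collection passed through `weights` would be skipped, per the notes above.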
    @staticmethod
    def log_metrics(m1=None, m2=None, m3=None, m4=None, m5=None, m6=None, m7=None, m8=None,
                    m9=None, m10=None, **kwargs):
        '''
        Log the metrics of the experiment. As an experimental feature, if the metrics are given as
        positional arguments, they will be logged under the name of the variable passed to them.
        If they are given as keyword arguments, they will be logged under the keyword.

        :param mi: The ith metric to be logged
        :type mi: scalar

        :Example:

        >>> acc = mlq.l(NaiveBayes)(alpha=1024, beta_param=7, c=12)
        >>> mlq.log_metrics(acc)

        This would log the accuracy of NaiveBayes under the column 'acc'. To provide a different
        name than that of the variable you can use the keyword argument syntax:

        >>> mlq.log_metrics(accuracy=acc)

        :Notes:

        - Your metric should be a scalar. You may need to convert a NumPy array into a scalar
          using :samp:`metric.item()`.
        - You can log multiple metrics at once using this function.
        '''
        if mlquest.active == False:
            warnings.warn("Attempting to log a metric when no run is active will do nothing")
        mlquest.log['metrics'] = {}
        mlquest.non_default_log['metrics'] = {}
        # See if any of m1-m10 are set and if so, add them to the log with the key being the variable name
        for i in range(1, 11):
            if locals()[f'm{i}'] is not None:
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")     # ignores a useless warning of the varname library
                    data = utils.stringify(locals()[f'm{i}'])
                    if data is not None:
                        mlquest.log['metrics'][argname(f'm{i}')] = data
                        mlquest.non_default_log['metrics'][argname(f'm{i}')] = data
        # Any kwargs are metrics with custom names, add them as well
        for key, value in kwargs.items():
            data = utils.stringify(value)
            if data is not None:
                mlquest.log['metrics'][key] = data
                mlquest.non_default_log['metrics'][key] = data
            else:
                warnings.warn(f"Metric {key} is either None or not a scalar and thus can't be logged")
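A short sketch of both naming styles; acc and f1 are placeholder scalars that would normally come from the wrapped training call:

from mlpath.mlquest.mlquest import mlquest as mlq

mlq.start_quest('Naive-Bayes')
acc, f1 = 0.93, 0.91
# 'acc' is taken from the variable name, 'f1_score' from the keyword.
mlq.log_metrics(acc, f1_score=f1)

Since each call re-creates the 'metrics' entry of the current run, it is simplest to log all metrics in a single call.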
    @staticmethod
    def to_log(col_name, dict=None, **kwargs):
        '''
        Grants logging with extensive access to the log.

        :param col_name: The name of the column to log to
        :type col_name: string
        :param dict: A dictionary of (subcolumn, value) pairs to be logged under the col_name column
        :param kwargs: Key-value pairs to be logged under the col_name column (an alternative to dict)

        :Example:

        >>> mlq.to_log('graphs', Scatterplot='../plots/plt21.jpg', Histogram='../plots/plt22.jpg')

        This would log the Scatterplot and Histogram under the 'graphs' column. Any previous runs
        will have empty values for these columns.
        '''
        if dict is not None:
            mlquest.log[col_name] = dict
        else:
            # check if mlquest.log[col_name] exists, if not, create it
            if col_name not in mlquest.log:
                mlquest.log[col_name] = {}
                mlquest.non_default_log[col_name] = {}
            for key, value in kwargs.items():
                mlquest.log[col_name][key] = value
                mlquest.non_default_log[col_name][key] = value
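Beyond the docstring example, the keyword form can also extend an existing column incrementally; a small sketch where the plot paths are placeholders:

from mlpath.mlquest.mlquest import mlquest as mlq

mlq.start_quest('Naive-Bayes')
mlq.to_log('graphs', Scatterplot='../plots/plt21.jpg')   # creates the 'graphs' column group
mlq.to_log('graphs', Histogram='../plots/plt22.jpg')     # adds another subcolumn to it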
    @staticmethod
    def save_quest():
        '''
        Uses pickle to save the quests object to a file.

        :meta private:
        '''
        quest_folder = mlquest.get_quest_folder()
        quests_folder = mlquest.get_quests_folder()
        # see if there is a 'Quests' folder, if not, create it
        if not os.path.exists(quests_folder):
            os.makedirs(quests_folder, exist_ok=True)
        with open(quest_folder + '/quests.mlq', 'wb') as f:
            pickle.dump(mlquest.quests, f)

    @staticmethod
    def save_logs(save_path='./'):
        '''
        Saves the logs of a quest in a table.

        :param save_path (optional): The path to save the logs to. Defaults to the current directory
        :type save_path: string

        :meta private:
        '''
        quest_folder = mlquest.get_quest_folder()
        quest_name = mlquest.quest_name
        # copy the quests table to the desired location
        shutil.copyfile(quest_folder + f'/{quest_name}.md', f'{save_path}/{quest_name}.md')
    @staticmethod
    def end_quest(save_ext=None, blacklist=[], log_defs=False):
        '''
        Ends an active run and internally saves it to the log. This must be called at the end of
        the experiment, else the run will not be logged.

        :param save_ext (optional): Where to save the log externally. Defaults to not saving externally at all (None).
        :type save_ext: string
        :param blacklist (optional): A list of columns to not log. Defaults to an empty list. A column can be
            passed as 'col_name' or 'header.col_name' if there is a clash in names.
        :type blacklist: list
        :param log_defs (optional): If true, it adds all the default columns that are not explicitly passed to the blacklist.
        :type log_defs: bool
        '''
        if mlquest.active == False:
            warnings.warn('No active mlquest to end')
        else:
            mlquest.log_defs = log_defs
            duration = time.time() - mlquest.start_time
            # set the duration of the experiment with the appropriate unit
            if duration < 1:
                mlquest.log['info']['duration'] = f'{duration * 1000:.2f} ms'
            elif duration < 60:
                mlquest.log['info']['duration'] = f'{duration:.2f} s'
            elif duration < 3600:
                mlquest.log['info']['duration'] = f'{duration / 60:.2f} min'
            else:
                mlquest.log['info']['duration'] = f'{duration / 3600:.2f} h'
            # check if the experiment already exists and set its name and id
            if mlquest.log['info']['name'] in mlquest.quests:
                quest_name = mlquest.log['info']['name']
                # get the id of the last experiment
                id = int(mlquest.quests[quest_name][-1]['info']['id']) + 1
                mlquest.log['info']['id'] = id
                mlquest.quests[quest_name].append(mlquest.log)
            else:
                id = 1
                mlquest.log['info']['id'] = id
                quest_name = mlquest.log['info']['name']
                mlquest.quests[quest_name] = [mlquest.log]
            runs_to_json(mlquest.quests[quest_name], mlquest.log_defs, mlquest.non_default_log, blacklist)
            json_to_html_table(last_k=None, save=True)
            if save_ext is not None:
                mlquest.save_logs(save_ext)
            mlquest.active = False
            mlquest.log = {}
            mlquest.save_quest()
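Putting the pieces together, a hedged end-to-end sketch of one run; train_nb is hypothetical, and the blacklist simply hides the 'c' column in the rendered table:

from mlpath.mlquest.mlquest import mlquest as mlq

def train_nb(alpha, beta_param=1.0, c=None):
    '''Hypothetical stand-in for a real training routine.'''
    return 0.93

mlq.start_quest('Naive-Bayes')
acc = mlq.l(train_nb)(alpha=1024, beta_param=7, c=12)
mlq.log_metrics(accuracy=acc)
# Appends the run to quests.mlq and regenerates <quest>.json, <quest>-config.json,
# and the markdown table, with the 'c' column flagged as hidden.
mlq.end_quest(blacklist=['c'], log_defs=False)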
    @staticmethod
    def show_logs(*args, last_k=None, highlight='yellow', **kwargs):
        '''
        Shows the logs of a quest in a table that can be rendered in a Jupyter notebook.

        :param last_k (optional): The number of (most recent) experiments to show. Defaults to all experiments.
        :type last_k: int
        :param highlight (optional): The color to highlight the most recent experiment with. Defaults to yellow.
        :type highlight: string

        :Example:

        >>> mlq.show_logs('NaiveBayesExp')
        '''
        # get the name of the folder containing the current file
        quest_name = mlquest.quest_name
        mlquest.curr_dir = os.path.basename(os.getcwd())
        assert os.path.exists(f'{mlquest.relative_path}/Quests/{mlquest.curr_dir}/{quest_name}'), \
            f'Quest {quest_name} does not exist yet. Please start a quest with that name first.'
        if 'quests.mlq' in os.listdir(f'{mlquest.relative_path}/Quests/{mlquest.curr_dir}/{quest_name}'):
            with open(mlquest.relative_path + f'/Quests/{mlquest.curr_dir}/{quest_name}/quests.mlq', 'rb') as f:
                mlquest.quests = pickle.load(f)
        # convert the file to an html table
        table = json_to_html_table(last_k=last_k, color=highlight)
        # display the table
        display(HTML(table))
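In a notebook, and assuming a quest has already been started in the same session (show_logs relies on the relative_path set by start_quest), the table can then be rendered inline:

from mlpath.mlquest.mlquest import mlquest as mlq

# Show the three most recent runs; values that changed from the previous run are colored red.
mlq.show_logs(last_k=3, highlight='red')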
    @staticmethod
    def run_server():
        '''
        Runs the server to display the logs of the quests in a web browser. This includes all the
        quests in the current directory.
        '''
        run_server()
    @staticmethod
    def delete_runs(run_ids):
        '''
        Permanently deletes runs whose ids are in run_ids from the log.

        :param run_ids: The ids (indices) of the runs to be deleted
        :type run_ids: list of ints

        :Example:

        >>> mlq.delete_runs([1, 2, 3])

        This would delete the runs with ids 1, 2, and 3 from the current quest.
        '''
        quest_name = mlquest.quest_name
        quest_folder = mlquest.get_quest_folder()
        # read the mlq file for the current quest
        with open(quest_folder + '/quests.mlq', 'rb') as f:
            data = dict(pickle.load(f))
        for run_id in run_ids:
            # loop on the runs and delete the run with the given id
            for i, run in enumerate(data[quest_name]):
                if run['info']['id'] == run_id:
                    del data[quest_name][i]
                    break
                if i == len(data[quest_name]) - 1:
                    warnings.warn(f"Run id {run_id} does not exist; failed to delete")
        # save the data to the mlq file
        with open(quest_folder + '/quests.mlq', 'wb') as f:
            pickle.dump(data, f)
        # update the json and html files
        runs_to_json(data[quest_name], None, None, [])
        json_to_html_table(last_k=None, save=True)
    @staticmethod
    def get_flat_dict(show_all=False):
        '''
        Convert the quests table to a flat dictionary. This is helpful if the table is needed in a
        csv or dataframe format.

        :param show_all: If True, the dictionary will contain all the columns in the table. If False,
            it will obey the blacklist and log_defs sent to end_quest.
        '''
        # read the json file
        json_file = mlquest.get_quest_json_file()
        json_config_file = mlquest.get_quest_json_config_file()
        with open(json_file, 'r') as f:
            j = json.load(f)
        with open(json_config_file, 'r') as f:
            config = json.load(f)
        # now let's flatten the dict
        flat_dict = {}
        for key in j.keys():
            for subkey in j[key].keys():
                if show_all:
                    flat_dict[f'{subkey}'] = j[key][subkey]
                else:
                    if config[key][subkey] == 'true':
                        flat_dict[f'{subkey}'] = j[key][subkey]
        return flat_dict
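Because the flat dictionary maps each visible (sub)column to a list with one value per run, it drops straight into tabular tooling. A sketch assuming pandas is installed; 'naive_bayes_runs.csv' is just an illustrative filename:

import pandas as pd
from mlpath.mlquest.mlquest import mlquest as mlq

flat = mlq.get_flat_dict(show_all=True)   # e.g. {'id': [...], 'duration': [...], 'alpha': [...], ...}
df = pd.DataFrame(flat)
df.to_csv('naive_bayes_runs.csv', index=False)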
    @staticmethod
    def delete_quest(quest_name):
        '''
        Permanently deletes the quest with the given name, along with all of its saved runs and files.

        :param quest_name: The name of the quest to delete
        :type quest_name: string
        '''
        relative_path = os.path.dirname(os.path.abspath(__file__))
        curr_dir = os.path.basename(os.getcwd())
        quest_folder = f'{relative_path}/Quests/{curr_dir}/{quest_name}'
        shutil.rmtree(quest_folder)

def remove_duplicate_rows(json_obj):
    '''
    Removes duplicate rows from the given nested json_obj, which is expected to be a two-level
    nested dictionary where the values are lists representing column values.

    :meta private:
    '''
    num_rows = len(json_obj['info']['id'])
    rows_to_remove = []
    old_row_values, new_row_values = [], []
    for i in range(num_rows):
        for key in json_obj.keys():
            for subkey in json_obj[key].keys():
                value = json_obj[key][subkey][i]
                # if the key is not info then append the value to the list
                if key != 'info':
                    new_row_values.append(value)
        if new_row_values == old_row_values:
            rows_to_remove.append(i)
        old_row_values = new_row_values
        new_row_values = []
    # remove the duplicate rows
    for i in range(len(rows_to_remove) - 1, -1, -1):
        k = rows_to_remove[i]
        for key in json_obj.keys():
            for subkey in json_obj[key].keys():
                json_obj[key][subkey].pop(k)
    return json_obj


def get_path_mask(json_obj):
    '''
    Given a json_obj, return a mask_obj of the same structure (a two-level dictionary of keys and
    subkeys where values are lists). The mask has the same shape as the json_obj, but a value is 1
    if it differs from the value in the previous row and 0 otherwise.

    :meta private:
    '''
    # make a mask obj of the same structure as json_obj
    mask_obj = {}
    for key in json_obj.keys():
        mask_obj[key] = {}
        for subkey in json_obj[key].keys():
            mask_obj[key][subkey] = []
    num_rows = len(json_obj['info']['id'])
    for i in range(num_rows):
        for key in json_obj.keys():
            for subkey in json_obj[key].keys():
                value = json_obj[key][subkey][i]
                if key != 'info':
                    if i != 0:
                        if value != json_obj[key][subkey][i-1]:
                            mask_obj[key][subkey].append(1)
                        else:
                            mask_obj[key][subkey].append(0)
                    else:
                        mask_obj[key][subkey].append(0)
                else:
                    mask_obj[key][subkey].append(0)
    return mask_obj


def runs_to_json(runs, log_defs, non_default_log, blacklist):
    '''
    Converts the runs of a quest to a json file.

    :runs: a list of runs
    :log_defs: whether or not to log the default parameters
    :non_default_log: a dictionary of non-default parameters to log
    :blacklist: a list of parameters to not log

    :meta private:
    '''
    json_folder = mlquest.get_quest_json_folder()
    json_file = mlquest.get_quest_json_file()
    json_config_file = mlquest.get_quest_json_config_file()
    if not os.path.exists(json_folder):
        os.makedirs(json_folder, exist_ok=True)
    big_dict = utils.merge_dicts(runs)
    # remove ['info']['name'] from the dict
    del big_dict['info']['name']
    # now convert to json
    j = json.dumps(big_dict, indent=4)
    # save the json file
    with open(json_file, 'w') as f:
        f.write(j)
    if log_defs is not None and non_default_log is not None:
        # Now let's make a version of big_dict called config_dict that replaces all the leaf values with 'true'
        config_dict = {}
        if log_defs == True:
            for key in big_dict.keys():
                config_dict[key] = {}
                for subkey in big_dict[key].keys():
                    if subkey not in blacklist:
                        config_dict[key][subkey] = 'true'
                    else:
                        config_dict[key][subkey] = 'false'
        else:
            # let's get the set of subkeys that are in the non_default_log
            for key in big_dict.keys():
                config_dict[key] = {}
                for subkey in big_dict[key].keys():
                    if key in non_default_log.keys():
                        if subkey not in non_default_log[key].keys() or subkey in blacklist:
                            config_dict[key][subkey] = 'false'
                        else:
                            config_dict[key][subkey] = 'true'
                    else:
                        config_dict[key][subkey] = 'false'
        for item in blacklist:
            if '.' in item:
                key, subkey = item.split('.')
                config_dict[key][subkey] = "false"
        # convert to json
        c = json.dumps(config_dict, indent=4)
        # save the json file
        with open(json_config_file, 'w') as f:
            f.write(c)


def json_to_html_table(last_k, color='yellow', save=False):
    '''
    Makes an html table from a nested json file.

    :last_k: the number of rows to show
    :color: the color used to highlight values that changed from the previous row
    :save: whether or not to save the html table as a markdown file

    :meta private:
    '''
    json_path = mlquest.get_quest_json_file()
    config_path = mlquest.get_quest_json_config_file()
    quest_path = mlquest.get_quest_folder()
    # read json file from path as dict
    with open(json_path, 'rb') as JSON:
        json_obj = json.load(JSON)
    with open(config_path, 'rb') as JSON:
        config_obj = json.load(JSON)
    json_obj = remove_duplicate_rows(json_obj)
    mask_obj = get_path_mask(json_obj)
    # convert to html table
    table = '<table>\n'
    # make a header row
    table += '<tr>\n'
    # for each key in the top-level dict make a column with colspan being the number of subkeys
    for key in json_obj.keys():
        # the length of the colspan is the number of subkeys with value 'true' in the config file
        length = [config_obj[key][subkey] for subkey in config_obj[key].keys()].count('true')
        if length > 0:
            table += f'<th colspan={length} style="text-align: center; vertical-align: middle;">{key}</th>\n'
    table += '</tr>\n'
    # for each subkey of the top-level dict, make a subheader row
    for key in json_obj.keys():
        for subkey in json_obj[key].keys():
            if config_obj[key][subkey] == 'true':
                table += f'<th style="text-align: center; vertical-align: middle;">{subkey}</th>\n'
    table += '</tr>\n'
    # get the number of ids to infer the number of rows
    num_rows = len(json_obj['info']['id'])
    if last_k is None:
        last_k = num_rows
    if last_k > num_rows:
        last_k = num_rows
    for i in range(num_rows - last_k, num_rows):
        table += '<tr>\n'
        for key in json_obj.keys():
            for subkey in json_obj[key].keys():
                if config_obj[key][subkey] == 'true':
                    html_color = color if mask_obj[key][subkey][i] and color else ''
                    value = json_obj[key][subkey][i] if json_obj[key][subkey][i] is not None else ''
                    table += f'<td style="text-align: center; vertical-align: middle;"> <font color={html_color}>{value}</font></td>\n'
        table += '</tr>\n'
    # save the html file
    if save:
        if not os.path.exists(quest_path):
            os.makedirs(quest_path, exist_ok=True)
        with open(quest_path + f'/{mlquest.quest_name}.md', 'w') as f:
            f.write(table)
    # return the html table
    return table
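To make the row and mask conventions concrete, here is a small hand-built json_obj of the expected two-level shape (keys -> subkeys -> per-run value lists) run through the two pure helpers above; the column names are illustrative:

from mlpath.mlquest.mlquest import remove_duplicate_rows, get_path_mask

json_obj = {
    'info':    {'id': [1, 2, 3], 'duration': ['1.00 s', '1.10 s', '1.20 s']},
    'train':   {'alpha': [1024, 1024, 512]},
    'metrics': {'accuracy': [0.93, 0.94, 0.95]},
}

# No two consecutive rows share identical non-'info' values, so nothing is removed here.
json_obj = remove_duplicate_rows(json_obj)

# A 1 marks a value that differs from the previous row; 'info' columns are always 0.
mask = get_path_mask(json_obj)
# mask['train']['alpha']      -> [0, 0, 1]
# mask['metrics']['accuracy'] -> [0, 1, 1]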