'''
This is the main module of mlquest. It contains the mlquest class which is used to log machine learning experiments.
'''
import mlpath.mlquest.utils as utils
from mlpath.mldir_cli.web.app import run_server
# pylint: skip-file
import time
import warnings
import inspect
from varname import argname
from copy import copy
import pickle
import os
import json
from IPython.display import display, HTML
import shutil
from collections import OrderedDict
class mlquest():
    '''
    The mlquest class provides methods and attributes to log machine learning experiments.
    '''
    # Class-level state shared by all helpers; at most one run is active at a time.
    quests = OrderedDict({})             # maps a quest name to its list of logged runs
    log = OrderedDict({})                # log of the currently active run
    active = False                       # whether a run is currently active
    start_time = None                    # start timestamp, used to compute the run's duration
    relative_path = ''                   # base path under which the 'Quests' folder lives
    curr_dir = None                      # name of the folder containing the current file (for saving purposes)
    non_default_log = OrderedDict({})    # only the arguments explicitly passed to logged functions
    log_defs = False                     # if True, default arguments are also logged
    quest_name = None                    # name of the active quest (e.g, the model in the current file)

    @staticmethod
    def get_quests_folder():
        '''
        Return the folder that groups every quest of the current directory.

        :meta private:
        '''
        return f'{mlquest.relative_path}/Quests/{mlquest.curr_dir}'

    @staticmethod
    def get_quest_folder():
        '''
        Return the folder of the currently active quest.

        :meta private:
        '''
        return f'{mlquest.relative_path}/Quests/{mlquest.curr_dir}/{mlquest.quest_name}'

    @staticmethod
    def get_quest_json_folder():
        '''
        Return the json subfolder of the currently active quest.

        :meta private:
        '''
        return f'{mlquest.get_quest_folder()}/json'

    @staticmethod
    def get_quest_json_file():
        '''
        Return the path of the json file holding the quest's runs.

        :meta private:
        '''
        return f'{mlquest.get_quest_folder()}/json/{mlquest.quest_name}.json'

    @staticmethod
    def get_quest_json_config_file():
        '''
        Return the path of the json config file (column visibility) of the quest.

        :meta private:
        '''
        return f'{mlquest.get_quest_folder()}/json/{mlquest.quest_name}-config.json'

    @staticmethod
    def start_quest(quest_name, **kwargs):
        '''
        Start a new run under the quest with quest_name. This function should be called before any other function with logging functionality.

        :param quest_name: The name of the experiment this run belongs to (e.g, the name of the model in the current file)
        :type quest_name: string

        :Example:

        The following would start a new quest called 'Naive-Bayes'

        >>> start_quest('Naive-Bayes')
        '''
        # 1. get the quest folder or make it if it doesn't exist
        mlquest.relative_path = os.path.dirname(os.path.abspath(__file__))
        mlquest.curr_dir = os.path.basename(os.getcwd())
        mlquest.quest_name = quest_name
        quest_folder = mlquest.get_quest_folder()
        if not os.path.exists(quest_folder):
            os.makedirs(quest_folder, exist_ok=True)
        # 2. load the quests dictionary from the file if it exists
        if 'quests.mlq' in os.listdir(quest_folder):
            with open(quest_folder + '/quests.mlq', 'rb') as f:
                mlquest.quests = pickle.load(f)
        # 3. Initiate the attributes of the new quest to be added to quests later
        if mlquest.active:
            warnings.warn("Attempting to start a run while another one is active may cause data overwrite")
        else:
            mlquest.active = True
            mlquest.log['info'] = {}
            mlquest.log['info']['name'] = quest_name
            mlquest.start_time = time.time()
            mlquest.log['info']['time'] = time.strftime('%X')
            mlquest.log['info']['date'] = time.strftime('%x')

    @staticmethod
    def clear():
        '''
        Clear the log record of the current run. You may use it while handling exceptions or debugging.
        '''
        if not mlquest.active:
            warnings.warn("Attempting to clear the current run when no run is active will do nothing")
            # fix: honour the warning — previously the log was cleared anyway
            return
        mlquest.log = {}

    @staticmethod
    def l(func, name=None):
        '''
        Log the scalar parameters of a function. This function should be called on any function that you want to log the parameters of.

        :param func: The function to be logged
        :type func: function
        :param name: A custom name of the function to be logged. If not given, the name of the function will be used.
        :type name: string
        :return: The function wrapped with the logging functionality

        :Example:

        The following would log the parameters of the function NaiveBayesFit in the current run log

        >>> accuracy = mlq.l(NaiveBayesFit)(alpha=1024, beta_param=7, c=12, )

        :Notes:

        - It doesn't matter whether the argument is given through a variable or as a value, it doesn't matter if its given as a named argument or not. :func:`mlq.l()` will log the values under the column corresponding to the name as in the function's signature.
        - :func:`mlq.l()` always tracks all scalar arguments given to a function that have a name using the function's signature
        - If you later make a new function then :samp:`MLQuest` may handle this by creating new columns that are empty for the previous runs (rows).
        - Likewise, deleting a function will make the corresponding columns empty for the future runs (rows).
        - :func:`mlq.l()` doesn't log collections to avoid having to deal with very large arrays. If your hyperparameter is a small array then you can still stringify it and log it using the :func:`mlq.to_log_ext()` method
        '''
        if not mlquest.active:
            warnings.warn("Attempting to log a function when no run is active will do nothing")
            return func

        # wrap the function in a more generic version with logging functionality
        def wrapped(*args, **kwargs):
            signature = inspect.signature(func)
            params = signature.parameters.values()
            # default values of parameters that were not overridden by a keyword argument
            defaults = {param.name: param.default
                        for param in params
                        if param.default is not inspect.Parameter.empty and kwargs.get(param.name) is None}
            values = {}   # will hold all the set values of the parameters
            # positional arguments are matched to their names via the signature
            for i, param in enumerate(params):
                if i < len(args):
                    data = utils.stringify(args[i])
                    if data is not None:
                        values[param.name] = data
            # keyword arguments (including those destined for **kwargs of func)
            for key, value in kwargs.items():
                data = utils.stringify(value)
                if data is not None:
                    values[key] = data
            # snapshot of explicitly-passed values, taken before defaults are merged in
            non_def_values = copy(values)
            for key, value in defaults.items():
                data = utils.stringify(value)
                if data is not None:
                    values[key] = data
            # set the values in the log with the key being the (custom) name of the function
            log_key = name if name else func.__name__
            mlquest.log[log_key] = values
            mlquest.non_default_log[log_key] = non_def_values
            return func(*args, **kwargs)
        return wrapped

    @staticmethod
    def log_metrics(m1=None, m2=None, m3=None, m4=None, m5=None, m6=None, m7=None, m8=None, m9=None, m10=None, **kwargs):
        '''
        Log the metrics of the experiment. As an experimental feature, if the metrics are given as positional arguments,
        they will be logged with the name of the variable given to them. If they are given as keyword arguments, they will
        be logged with the name as the keyword.

        :param mi: The ith metric to be logged
        :type mi: scalar

        :example:

        >>> acc = mlq.l(NaiveBayes)(alpha=1024, beta_param=7, c=12, )
        >>> mlq.log_metrics(acc)

        This would log the accuracy of the NaiveBayes under the column 'acc'. To provide a different name than that of the variable
        you can use the keyword argument syntax:

        >>> mlq.log_metrics(accuracy=acc)

        :Notes:

        - Your metric should be a scalar. You may need to convert a Numpy array into a scalar by using the :samp:`metric.item()`.
        - You can log multiple metrics at once using this function
        '''
        if not mlquest.active:
            warnings.warn("Attempting to log a metric when no run is active will do nothing")
            # fix: honour the warning — previously metrics were logged anyway
            return
        mlquest.log['metrics'] = {}
        mlquest.non_default_log['metrics'] = {}
        # See if any of m1-m10 are set and if so, add them to the log with the key being the variable name
        for i in range(1, 11):
            if locals()[f'm{i}'] is not None:
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")   # ignores a useless warning of the varname library
                    data = utils.stringify(locals()[f'm{i}'])
                    if data is not None:
                        # argname recovers the caller-side variable name of the argument
                        mlquest.log['metrics'][argname(f'm{i}')] = data
                        mlquest.non_default_log['metrics'][argname(f'm{i}')] = data
        # Any kwargs are metrics with custom names, add them as well
        for key, value in kwargs.items():
            data = utils.stringify(value)
            if data is not None:
                mlquest.log['metrics'][key] = data
                mlquest.non_default_log['metrics'][key] = data
            else:
                # fix: dropped a leftover debug print of the (always None) data here
                warnings.warn(f"Metric {key} is either None or not a scalar and thus can't be logged")

    @staticmethod
    def to_log(col_name, dict=None, **kwargs):
        '''
        Grants logging with extensive access to the log.

        :param col_name: The name of the column to log to
        :type col_name: string
        :param dict: A dictionary of the key (subcolumns), values to be logged under col_name column
        :param kwargs: key value pairs to be logged under col_name column (an alternative to dict)

        :Example:

        >>> mlq.to_log('graphs', Scatterplot='../plots/plt21.jpg', Histogram='../plots/plt22.jpg')

        This would log the Scatterplot and Histogram under the 'graphs' column. Any previous runs will have empty values for these columns.
        '''
        # NOTE: the parameter is named `dict` (shadowing the builtin) for backward
        # compatibility with callers that pass it by keyword.
        if dict is not None:
            mlquest.log[col_name] = dict
            # fix: keep non_default_log consistent with the kwargs branch so the
            # column is not hidden when log_defs is False
            mlquest.non_default_log[col_name] = dict
        else:
            # check if mlquest.log[col_name] exists, if not, create it
            if col_name not in mlquest.log:
                mlquest.log[col_name] = {}
                mlquest.non_default_log[col_name] = {}
            for key, value in kwargs.items():
                mlquest.log[col_name][key] = value
                mlquest.non_default_log[col_name][key] = value

    @staticmethod
    def save_quest():
        '''
        Uses pickle to save the quests object to a file.

        :meta private:
        '''
        quest_folder = mlquest.get_quest_folder()
        quests_folder = mlquest.get_quests_folder()
        # see if there is a 'mlquests' folder, if not, create it
        if not os.path.exists(quests_folder):
            os.makedirs(quests_folder, exist_ok=True)
        with open(quest_folder + '/quests.mlq', 'wb') as f:
            pickle.dump(mlquest.quests, f)

    @staticmethod
    def save_logs(save_path='./'):
        '''
        Saves the logs of a quest in a table.

        :param save_path (optional): The path to save the logs to. Defaults to the current directory
        :type save_path: string

        :meta private:
        '''
        quest_folder = mlquest.get_quest_folder()
        quest_name = mlquest.quest_name
        # copy the quests table to the desired location
        shutil.copyfile(quest_folder + f'/{quest_name}.md', f'{save_path}/{quest_name}.md')

    @staticmethod
    def end_quest(save_ext=None, blacklist=None, log_defs=False):
        '''
        ends an active run and internally saves it to the log. This must called at the end of the experiment else it will not be logged.

        :param save_ext (optional): Where to save the log externally. Defaults to not saving externally at all (None).
        :type save_ext: string
        :param blacklist (optional): A list of columns to not log. Defaults to an empty list. The column can be passed as 'col_name' or 'header.col_name' if there is a clash in names.
        :type blacklist: list
        :param log_defs (optional): If true, it adds all the default columns that are not explictly passed to the blacklist.
        :type log_defs: bool
        '''
        # fix: avoid the mutable default argument `blacklist=[]`
        if blacklist is None:
            blacklist = []
        if not mlquest.active:
            warnings.warn('No active mlquest to end')
            return
        mlquest.log_defs = log_defs
        duration = time.time() - mlquest.start_time
        # set the duration of the experiment with the appropriate unit
        # fix: the original checked `> 60` before `> 3600`, making the hours
        # branch unreachable and leaving a run of exactly 60 s with no duration
        if duration < 1:
            mlquest.log['info']['duration'] = f'{duration * 1000:.2f} ms'
        elif duration < 60:
            mlquest.log['info']['duration'] = f'{duration:.2f} s'
        elif duration < 3600:
            mlquest.log['info']['duration'] = f'{duration / 60:.2f} min'
        else:
            mlquest.log['info']['duration'] = f'{duration / 3600:.2f} h'
        # check if the experiment already exists and set its name and id
        quest_name = mlquest.log['info']['name']
        if quest_name in mlquest.quests:
            # id is one more than the id of the last experiment
            id = int(mlquest.quests[quest_name][-1]['info']['id']) + 1
            mlquest.log['info']['id'] = id
            mlquest.quests[quest_name].append(mlquest.log)
        else:
            mlquest.log['info']['id'] = 1
            mlquest.quests[quest_name] = [mlquest.log]
        runs_to_json(mlquest.quests[quest_name], mlquest.log_defs, mlquest.non_default_log, blacklist)
        json_to_html_table(last_k=None, save=True)
        if save_ext is not None:
            mlquest.save_logs(save_ext)
        mlquest.active = False
        mlquest.log = {}
        mlquest.save_quest()

    @staticmethod
    def show_logs(*args, last_k=None, highlight='yellow', **kwargs):
        '''
        Shows the logs of a quest in a table that can be rendered in a jupyter notebook.

        :param last_k (optional): The number of (most recent) experiments to show. Defaults to all experiments.
        :type last_k: int
        :param highlight (optional): The color to highlight the most recent experiment with. Defaults to yellow.
        :type highlight: string

        :Example:

        >>> mlq.show_logs('NaiveBayesExp')
        '''
        # get the name of the folder containing the current file
        quest_name = mlquest.quest_name
        mlquest.curr_dir = os.path.basename(os.getcwd())
        assert os.path.exists(f'{mlquest.relative_path}/Quests/{mlquest.curr_dir}/{quest_name}'),\
            f'Quest {quest_name} does not exist yet. Please start a quest with that name first.'
        if 'quests.mlq' in os.listdir(f'{mlquest.relative_path}/Quests/{mlquest.curr_dir}/{quest_name}'):
            with open(mlquest.relative_path + f'/Quests/{mlquest.curr_dir}/{quest_name}/quests.mlq', 'rb') as f:
                mlquest.quests = pickle.load(f)
        # convert the file to html table
        table = json_to_html_table(last_k=last_k, color=highlight)
        # display the table
        display(HTML(table))

    @staticmethod
    def run_server():
        '''
        Runs the server to display the logs of the quests in a web browser. This includes all the quests in the current directory.
        '''
        run_server()

    @staticmethod
    def delete_runs(run_ids):
        '''
        permanently deletes runs that have ids in run_ids from the log.

        :param run_ids: The ids (indecies) of the runs to be deleted
        :type run_ids: list of ints

        :Example:

        >>> mlq.delete_runs([1, 2, 3])

        This would delete the runs with ids 1, 2, and 3 from the current quest
        '''
        quest_name = mlquest.quest_name
        quest_folder = mlquest.get_quest_folder()
        # read the mlq file from table_dest and quest_name
        with open(quest_folder + '/quests.mlq', 'rb') as f:
            data = dict(pickle.load(f))
        for run_id in run_ids:
            # loop on the runs and delete the run with the given id
            for i, run in enumerate(data[quest_name]):
                if run['info']['id'] == run_id:
                    del data[quest_name][i]
                    break
            else:
                # fix: the original detected "not found" via `i == len(...) - 1`,
                # which raised NameError on an empty run list and mis-reported
                # deletions at the last index; for/else is the reliable check
                warnings.warn(f"Run id {run_id} does not exist; failed to delete")
        # save the data to the mlq file
        with open(quest_folder + '/quests.mlq', 'wb') as f:
            pickle.dump(data, f)
        # update the json and html files
        runs_to_json(data[quest_name], None, None, [])
        json_to_html_table(last_k=None, save=True)

    @staticmethod
    def get_flat_dict(show_all=False):
        '''
        Convert the quests table to a flat dictionary. This is helpful if the table is needed in a csv or dataframe format.

        :param show_all: If True, the dictionary will contain all the columns in the table. If False, it will obey the blacklist and log_defs sent to end_quest.
        '''
        # read the json file and its config (column visibility) file
        json_file = mlquest.get_quest_json_file()
        json_config_file = mlquest.get_quest_json_config_file()
        with open(json_file, 'r') as f:
            j = json.load(f)
        with open(json_config_file, 'r') as f:
            config = json.load(f)
        # now lets flatten the dict
        # NOTE(review): a subkey appearing under two different top-level keys
        # would be clobbered here — assumed not to happen in practice
        flat_dict = {}
        for key in j.keys():
            for subkey in j[key].keys():
                if show_all or config[key][subkey] == 'true':
                    flat_dict[f'{subkey}'] = j[key][subkey]
        return flat_dict

    @staticmethod
    def delete_quest(quest_name):
        '''
        Permanently deletes the folder of the quest with the given name.

        :param quest_name: The name of the quest to delete
        :type quest_name: string
        '''
        relative_path = os.path.dirname(os.path.abspath(__file__))
        curr_dir = os.path.basename(os.getcwd())
        quest_folder = f'{relative_path}/Quests/{curr_dir}/{quest_name}'
        shutil.rmtree(quest_folder)
def remove_duplicate_rows(json_obj):
    '''
    Removes duplicate rows from the given nested json_obj which is expected to be a
    two-level nested dictionary where the values are lists representing column values.
    A row is a duplicate when all of its non-'info' values match those of the row
    directly above it.

    :meta private:
    '''
    num_rows = len(json_obj['info']['id'])
    # collect the indices of rows whose non-info values repeat the previous row
    duplicate_rows = []
    prev_signature = []
    for row in range(num_rows):
        signature = [json_obj[key][subkey][row]
                     for key in json_obj
                     for subkey in json_obj[key]
                     if key != 'info']
        if signature == prev_signature:
            duplicate_rows.append(row)
        prev_signature = signature
    # drop the duplicates from every column, from the back so indices stay valid
    for row in reversed(duplicate_rows):
        for key in json_obj:
            for subkey in json_obj[key]:
                json_obj[key][subkey].pop(row)
    return json_obj
def get_path_mask(json_obj):
    '''
    Given a json_obj return a mask_obj of the same structure (two level dictionary of keys and subkeys
    and where values are lists). A cell is 1 when its value differs from the one in the previous row
    (only for non-'info' columns and rows after the first); every other cell is 0.

    :meta private:
    '''
    # build an empty mask with the same key/subkey layout as json_obj
    mask_obj = {key: {subkey: [] for subkey in json_obj[key]} for key in json_obj}
    num_rows = len(json_obj['info']['id'])
    for row in range(num_rows):
        for key, columns in json_obj.items():
            for subkey, column in columns.items():
                changed = (key != 'info'
                           and row != 0
                           and column[row] != column[row - 1])
                mask_obj[key][subkey].append(1 if changed else 0)
    return mask_obj
def runs_to_json(runs, log_defs, non_default_log, blacklist):
    '''
    converts the runs of a quest to a json file and (optionally) a config file
    that records which columns should be shown ('true') or hidden ('false').

    :runs: a list of runs
    :log_defs: whether or not to log the default parameters (None skips writing the config file)
    :non_default_log: a dictionary of non-default parameters to log (None skips writing the config file)
    :blacklist: a list of parameters to not log; items may be 'subkey' or 'key.subkey'
    :meta private:
    '''
    json_folder = mlquest.get_quest_json_folder()
    json_file = mlquest.get_quest_json_file()
    json_config_file = mlquest.get_quest_json_config_file()
    if not os.path.exists(json_folder):
        os.makedirs(json_folder, exist_ok=True)
    # merge all the runs into one dict of columns
    big_dict = utils.merge_dicts(runs)
    # the quest name is constant across runs; no need to store it per row
    del big_dict['info']['name']
    # save the runs as json
    j = json.dumps(big_dict, indent=4)
    with open(json_file, 'w') as f:
        f.write(j)
    if log_defs is not None and non_default_log is not None:
        # config_dict mirrors big_dict but with leaf values replaced by 'true'/'false'
        config_dict = {}
        if log_defs == True:
            # show everything except blacklisted subkeys
            for key in big_dict.keys():
                config_dict[key] = {}
                for subkey in big_dict[key].keys():
                    if subkey not in blacklist:
                        config_dict[key][subkey] = 'true'
                    else:
                        config_dict[key][subkey] = 'false'
        else:
            # show only subkeys that were explicitly passed (i.e. in non_default_log)
            for key in big_dict.keys():
                config_dict[key] = {}
                for subkey in big_dict[key].keys():
                    if key in non_default_log.keys():
                        if subkey not in non_default_log[key].keys() or subkey in blacklist:
                            config_dict[key][subkey] = 'false'
                        else:
                            config_dict[key][subkey] = 'true'
                    else:
                        config_dict[key][subkey] = 'false'
        # apply qualified 'key.subkey' blacklist entries
        for item in blacklist:
            if '.' in item:
                # fix: split only on the first dot and guard against unknown
                # keys/subkeys (the original raised KeyError on them)
                key, subkey = item.split('.', 1)
                if key in config_dict and subkey in config_dict[key]:
                    config_dict[key][subkey] = "false"
        # save the config file as json
        c = json.dumps(config_dict, indent=4)
        with open(json_config_file, 'w') as f:
            f.write(c)
def json_to_html_table(last_k, color='yellow', save=False):
    '''
    Makes an html table from a nested json file.

    :last_k: the number of rows to show (None means all rows)
    :color: the color used to highlight cells whose value changed from the previous row
    :save: whether or not to save the html file as markdown
    :meta private:
    '''
    json_path = mlquest.get_quest_json_file()
    config_path = mlquest.get_quest_json_config_file()
    quest_path = mlquest.get_quest_folder()
    # read the runs and the column-visibility config from disk
    with open(json_path, 'rb') as JSON:
        json_obj = json.load(JSON)
    with open(config_path, 'rb') as JSON:
        config_obj = json.load(JSON)
    json_obj = remove_duplicate_rows(json_obj)
    # mask marks the cells that changed relative to the previous row (to highlight them)
    mask_obj = get_path_mask(json_obj)
    # convert to html table
    table = '<table>\n'
    # make a header row: one cell per top-level key spanning its visible subkeys
    table += '<tr>\n'
    for key in json_obj.keys():
        # the colspan is the number of subkeys with value 'true' in the config file
        length = [config_obj[key][subkey] for subkey in config_obj[key].keys()].count('true')
        if length > 0:
            table += f'<th colspan={length} style="text-align: center; vertical-align: middle;">{key}</th>\n'
    table += '</tr>\n'
    # subheader row: one cell per visible subkey
    # fix: the original never opened this row with <tr>, yielding malformed HTML
    table += '<tr>\n'
    for key in json_obj.keys():
        for subkey in json_obj[key].keys():
            if config_obj[key][subkey] == 'true':
                table += f'<th style="text-align: center; vertical-align: middle;">{subkey}</th>\n'
    table += '</tr>\n'
    # get the number of ids to infer the number of rows
    num_rows = len(json_obj['info']['id'])
    if last_k is None:
        last_k = num_rows
    if last_k > num_rows:
        last_k = num_rows
    for i in range(num_rows - last_k, num_rows):
        table += '<tr>\n'
        for key in json_obj.keys():
            for subkey in json_obj[key].keys():
                if config_obj[key][subkey] == 'true':
                    html_color = color if mask_obj[key][subkey][i] and color else ''
                    value = json_obj[key][subkey][i] if json_obj[key][subkey][i] is not None else ''
                    table += f'<td style="text-align: center; vertical-align: middle;"> <font color={html_color}>{value}</font></td>\n'
        table += '</tr>\n'
    # fix: close the table element (the original left <table> unclosed)
    table += '</table>\n'
    # save the html file
    if save:
        if not os.path.exists(quest_path):
            os.makedirs(quest_path, exist_ok=True)
        with open(quest_path + f'/{mlquest.quest_name}.md', 'w') as f:
            f.write(table)
    # return the html table
    return table