# -*- coding: utf-8 -*-
"""
.. moduleauthor:: Nicholas Long (nicholas.l.long@colorado.edu, nicholas.lee.long@gmail.com)
"""
import os
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Module-level side effect: importing this module switches seaborn's global
# plotting theme to the "ticks" style. color_codes=True asks seaborn to remap
# matplotlib's single-letter color shorthands to the seaborn palette.
sns.set(style="ticks", color_codes=True)
def evaluate_process_cv_results(cv_result_file, response, output_dir):
    """Plot diagnostics for a cross-validation results CSV file.

    The CSV is loaded, bookkeeping columns are removed, and three figures are
    written into ``output_dir``:

    * ``fig_cv_<response>_pairplot.png`` -- pair plot of all remaining columns.
    * ``fig_cv_<response>_time_v_score_hex.png`` -- hexbin joint plot of
      ``mean_fit_time`` against ``mean_test_score``.
    * ``fig_cv_<response>_time_v_score.png`` -- scatter plot of the same two
      columns on a "whitegrid" background.

    If ``cv_result_file`` does not exist the function silently does nothing.

    :param cv_result_file: str, path to the cross-validation results CSV. It
        must contain the columns ``response``, ``downsample``,
        ``mean_score_time``, ``rank_test_score``, ``mean_train_score``,
        ``mean_fit_time`` and ``mean_test_score``.
    :param response: str, label that is only used to build the file names.
    :param output_dir: str, directory that receives the figures. It is not
        created here.
    :return: None
    :raises KeyError: if one of the required columns is missing.
    """
    if not os.path.exists(cv_result_file):
        return

    # Load the cv results
    print("Reading CV results file: %s" % cv_result_file)
    df = pd.read_csv(cv_result_file)

    # ``columns=`` is used instead of a positional axis because pandas 2.0
    # removed the positional ``axis`` argument of DataFrame.drop.
    df = df.drop(columns=['response'])

    # Fill in the max_depth that has NA when it was set to auto
    df = df.fillna(0)

    df = df.drop(
        columns=['downsample', 'mean_score_time', 'rank_test_score', 'mean_train_score']
    )

    # Drop whatever column is now first.
    # NOTE(review): presumably the index column saved with the CSV -- confirm.
    df = df.drop(columns=df.columns[[0]])

    # Pair plot of every remaining column. seaborn creates its own figure,
    # which becomes the current figure, so plt.close() releases it.
    pair_grid = sns.pairplot(df)
    pair_grid.savefig(
        os.path.join(output_dir, 'fig_cv_%s_pairplot.png' % response)
    )
    plt.close()

    # Hexbin joint plot of fit time against test score. x and y are passed by
    # keyword because seaborn 0.12 made them keyword-only. jointplot builds its
    # own figure, so no figure is created up front.
    joint_grid = sns.jointplot(
        x=df['mean_fit_time'], y=df['mean_test_score'], kind="hex"
    )
    joint_grid.set_axis_labels(
        'Mean Fit Time (seconds)', 'Mean Test Score (fraction)'
    )
    joint_grid.savefig(
        os.path.join(output_dir, 'fig_cv_%s_time_v_score_hex.png' % response)
    )
    plt.close()

    # Same data as a scatter plot -- whitegrid background
    with plt.rc_context(dict(sns.axes_style("whitegrid"))):
        fig, ax = plt.subplots(figsize=(6.5, 6.5))
        sns.scatterplot(x=df['mean_fit_time'], y=df['mean_test_score'], ax=ax)
        ax.set_xlabel('Mean Fit Time (seconds)')
        ax.set_ylabel('Mean Test Score (fraction)')
        fig.savefig(
            os.path.join(output_dir, 'fig_cv_%s_time_v_score.png' % response)
        )
        # Close rather than clear so figures do not pile up across calls.
        plt.close(fig)
def evaluate_process_model_results(model_results_file, output_dir):
    """Plot the time spent building and cross-validating each model.

    The results CSV is loaded and, when rows with ``model_type == 'best'``
    exist, restricted to them. A horizontal grouped bar chart of
    ``time_to_build`` and ``time_to_cv`` per ``name`` is written to
    ``<output_dir>/fig_time_to_build.png``.

    When both ``ETSHeatingOutletTemperature`` and
    ``ETSCoolingOutletTemperature`` are present, the cooling rows are dropped
    and the heating rows are relabelled ``ETSOutletTemperature`` so the pair
    appears once in the chart.

    If ``model_results_file`` does not exist the function silently does
    nothing.

    :param model_results_file: str, path to the model results CSV. It must
        contain the columns ``model_type``, ``name``, ``time_to_build`` and
        ``time_to_cv``.
    :param output_dir: str, directory that receives the figure. It is not
        created here.
    :return: None
    """
    if not os.path.exists(model_results_file):
        return

    # Process the model results
    df = pd.read_csv(model_results_file)

    # If best exists, then use that, otherwise, just use what is in the column
    if 'best' in df['model_type'].unique():
        df = df[df['model_type'] == 'best']

    # If the two similar ETS outlet temperature entries are BOTH present,
    # remove one of them and rename the remaining one. The earlier check was
    # inverted (it required every name to be one of the pair), so it never
    # merged when other names existed and dropped everything when only the
    # cooling entry was present.
    ets_pair = {'ETSHeatingOutletTemperature', 'ETSCoolingOutletTemperature'}
    if ets_pair.issubset(set(df['name'].unique())):
        # .copy() so the rename writes to an independent frame, not a slice.
        df = df[df['name'] != 'ETSCoolingOutletTemperature'].copy()
        df.loc[df['name'] == 'ETSHeatingOutletTemperature', 'name'] = 'ETSOutletTemperature'

    # Melt the data for plot purposes
    melted_df = pd.melt(
        df[['name', 'time_to_build', 'time_to_cv']],
        id_vars='name',
        var_name='model',
        value_name='time'
    )

    # Plot the data
    fig = plt.figure(figsize=(8, 3), dpi=100)
    # NOTE(review): ``ci=None`` is kept for compatibility with older seaborn;
    # newer releases prefer ``errorbar=None`` -- confirm the supported version.
    ax = sns.barplot(
        x='time', y='name', hue='model', data=melted_df, ci=None, ax=fig.gca()
    )
    ax.set_xlabel('Time (seconds)')
    ax.set_ylabel('')
    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, 'fig_time_to_build.png'))
    # Close rather than clear so figures do not pile up across calls.
    plt.close(fig)
def evaluate_process_all_model_results(data, validation_dir):
    """Write the combined model results and a Pearson summary to CSV.

    Two files are written into ``validation_dir``:

    * ``all_model_results.csv`` -- every row of ``data``, unmodified.
    * ``pcc_model_results.csv`` -- the ``name``, ``model_method`` and
      ``pearson`` columns of the rows whose ``model_type`` is ``'best'``,
      sorted by ``name`` and then ``model_method``.

    Neither file includes the DataFrame index.

    :param data: pandas.DataFrame with at least the columns ``name``,
        ``model_method``, ``model_type`` and ``pearson``.
    :param validation_dir: str, directory that receives the CSV files. It is
        not created here.
    :return: None
    """
    # Select the best models first so a missing column fails before any file
    # is written.
    best_models = data[data['model_type'] == 'best'].sort_values(
        by=['name', 'model_method']
    )

    data.to_csv(
        os.path.join(validation_dir, 'all_model_results.csv'), index=False
    )

    summary_columns = ['name', 'model_method', 'pearson']
    best_models[summary_columns].to_csv(
        os.path.join(validation_dir, 'pcc_model_results.csv'), index=False
    )