Source code for rom.validation_helpers

# -*- coding: utf-8 -*-
"""
.. moduleauthor:: Nicholas Long (nicholas.l.long@colorado.edu, nicholas.lee.long@gmail.com)
"""

from collections import OrderedDict
from math import sqrt

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.plotting import lag_plot
from sklearn.metrics import mean_squared_error

from .shared import save_dict_to_csv


def validation_plot_energy_temp(melted_df, filename):
    """
    Scatter plot of energy versus outdoor air drybulb temperature, written to ``filename``.

    :param melted_df: DataFrame providing the columns ``SiteOutdoorAirDrybulbTemperature`` (x axis),
        ``Energy`` (y axis) and ``Model`` (marker style of each point).
    :param filename: Path passed to ``Figure.savefig`` for the rendered image.
    :return: None
    """
    with plt.rc_context(dict(sns.axes_style("whitegrid"))):
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.scatterplot(
            x="SiteOutdoorAirDrybulbTemperature",
            y="Energy",
            style="Model",
            data=melted_df,
            ax=ax,
        )
        ax.set_xlabel('Site Outdoor Air Drybulb Temperature (deg C)')
        ax.set_ylabel('Total HVAC Power (W)')
        fig.savefig(filename)
        # Close (not just clear) the figure: a cleared figure stays registered with pyplot,
        # so repeated calls would otherwise accumulate open figures.
        plt.close(fig)
def validation_plot_timeseries(melted_df, filename):
    """
    Plot one line per ``Variable`` over ``DateTime`` and write the figure to ``filename``.

    Side effects beyond the saved image: ``sns.set(color_codes=True)`` and the
    ``figure.figsize`` rcParam are applied globally and remain in effect after the call.

    :param melted_df: DataFrame providing the columns ``DateTime``, ``Variable``, ``Value`` and
        ``Dummy`` (passed to the plot as the ``unit`` column).
    :param filename: Path passed to ``Figure.savefig``. When it contains ``'Temperature'`` the
        y axis is labeled as temperature, otherwise as power.
    :return: None
    """

    def date_formatter(x, pos):
        # ``pos`` is unused but required by the FuncFormatter callback signature.
        return pd.to_datetime(x).strftime('%Y-%m-%d\n %H:%M')

    sns.set(color_codes=True)
    plt.rcParams['figure.figsize'] = [10, 4]
    with plt.rc_context(dict(sns.axes_style("whitegrid"))):
        fig, ax = plt.subplots()
        # NOTE(review): sns.tsplot was deprecated in seaborn 0.9 and is absent from later
        # releases -- confirm the pinned seaborn version. Moving to sns.lineplot would also
        # require revisiting date_formatter, since the tick values it receives may differ.
        sns.tsplot(melted_df, time='DateTime', unit='Dummy', condition='Variable',
                   value='Value', ax=ax)

        # Render the numeric tick values as date strings.
        ax.xaxis.set_major_formatter(mpl.ticker.FuncFormatter(date_formatter))

        if 'Temperature' in filename:
            ax.set(xlabel='', ylabel='Temperature (deg C)')
        else:
            ax.set(xlabel='', ylabel='Power (W)')

        # Put the labels at an angle since they tend to be too long
        fig.autofmt_xdate()
        fig.savefig(filename)
        # Close (not just clear) the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
def validation_save_metrics(df, output_dir):
    """
    Save the model performance metrics and the plots derived from them into ``output_dir``.

    Files written: ``metrics.csv``, ``fig_performance_disk_size.png``,
    ``fig_performance_load_time.png``, ``load_time_metrics.csv`` and
    ``fig_performance_run_time.png``.

    ``df`` is modified in place after ``metrics.csv`` has been written: ``disk_size`` is cast to
    float and the columns ``ind``, ``Disk Size (Log)``, ``Response`` and ``Type`` are added.

    :param df: DataFrame providing the columns ``disk_size``, ``response``, ``model_type``,
        ``load_time``, ``run_time_8760`` and ``run_time_single``.
    :param output_dir: Directory prefix used to build every output path.
    :return: None
    """
    # Save the model performance data (before the plotting-only columns are added)
    df.to_csv('%s/metrics.csv' % output_dir, index=False)

    df['disk_size'] = df['disk_size'].astype(float)
    df['ind'] = df.index
    df['Disk Size (Log)'] = np.log(df['disk_size'])
    df['Response'] = df['response']
    df['Type'] = df['model_type']

    # Plot the disk size
    with plt.rc_context(dict(sns.axes_style("whitegrid"))):
        fig, ax = plt.subplots(figsize=(10, 4))
        sns.scatterplot(x="ind", y="Disk Size (Log)", style="Type", hue="Response",
                        sizes=(10, 200), data=df, ax=ax)
        ax.set_xlabel('Index')
        ax.set_ylabel('Log Disk Size (log(MB))')
        fig.savefig('%s/fig_performance_disk_size.png' % (output_dir))
        # Close (not just clear) so figures do not accumulate in pyplot.
        plt.close(fig)

    # Average the load and run times per model type
    table = df.pivot_table(
        index=['Type'],
        values=['load_time', 'run_time_8760', 'run_time_single'],
        aggfunc=np.average,
    )
    # Convert back to a flat dataframe so 'Type' is a regular column again
    table = pd.DataFrame(table.to_records())

    with plt.rc_context(dict(sns.axes_style('whitegrid'))):
        fig = plt.figure(figsize=(10, 4))
        # barplot draws on the current axes, i.e. the figure created just above
        ax = sns.barplot(x='Type', y='load_time', data=table)
        ax.set_xlabel('Model Type')
        ax.set_ylabel('Average Time (seconds)')
        fig.tight_layout()
        fig.savefig('%s/fig_performance_load_time.png' % output_dir)
        plt.close(fig)

    table.rename(
        columns={'run_time_single': 'Run Time - Single', 'run_time_8760': 'Run Time - 8760'},
        inplace=True,
    )
    table.to_csv('%s/load_time_metrics.csv' % output_dir, index=False)

    # Long format so both run times can be drawn as grouped bars per model type
    melted_df = pd.melt(table[['Type', 'Run Time - Single', 'Run Time - 8760']], id_vars='Type')
    with plt.rc_context(dict(sns.axes_style('whitegrid'))):
        fig = plt.figure(figsize=(10, 4))
        # barplot draws on the current axes, i.e. the figure created just above
        ax = sns.barplot(x="Type", y="value", hue="variable", data=melted_df)
        ax.set_xlabel('Model Type')
        ax.set_ylabel('Average Time (seconds)')
        fig.tight_layout()
        fig.savefig('%s/fig_performance_run_time.png' % output_dir)
        plt.close(fig)
def validate_dataframe(df, metadata, image_save_dir):
    """
    Take the dataframe and perform various validations and create plots.

    Steps, in order:

    1. Add total heating/cooling/HVAC energy columns for the actual data and for each model.
    2. For every model/response pair, save a regression plot and a lag plot and compute
       RMSE, NMBE and CV(RMSE); the statistics are written to ``statistics.csv``.
    3. Save energy-versus-outdoor-temperature scatter plots (actual, per model, all combined).
    4. Save timeseries plots for three fixed 2009 date windows (Swing, Summer, Winter).

    ``df`` is modified in place: the total columns are added and the ``Total HVAC Energy``
    columns are divided by the conversion constant used below.

    :param df: Contains the actual and modeled data for various ROMs. Must provide ``DateTime``,
        ``SiteOutdoorAirDrybulbTemperature``, the four hard coded heating/cooling response
        columns, every response listed in ``metadata`` and the matching
        ``"Modeled <moniker> <response>"`` columns.
    :param metadata: Mapping of model type to a dict with the keys ``'moniker'`` and
        ``'responses'`` (iterable of response column names).
    :param image_save_dir: Directory prefix used for every figure and for ``statistics.csv``.
    :return: None
    """
    # Divisor used everywhere the energy columns are converted to watts.
    energy_to_watts_divisor = 277777.77

    # Create some new columns for total energy
    # TODO: remove hard coded response variables
    df['Total Heating Energy'] = df['HeatingElectricity'] + df['DistrictHeatingHotWaterEnergy']
    df['Total Cooling Energy'] = df['CoolingElectricity'] + df['DistrictCoolingChilledWaterEnergy']
    df['Total HVAC Energy'] = df['Total Heating Energy'] + df['Total Cooling Energy']

    # Aggregate the data and break out the cooling and heating totals
    for model_data in metadata.values():
        moniker = model_data['moniker']
        heating_col_name = 'Total Heating Energy %s' % moniker
        cooling_col_name = 'Total Cooling Energy %s' % moniker
        total_col_name = 'Total HVAC Energy %s' % moniker

        for response in model_data['responses']:
            modeled_name = "Modeled %s %s" % (moniker, response)

            # calculate the total hvac energy for each model type
            if 'Heating' in modeled_name:
                if heating_col_name not in df.columns:
                    # initialize the column
                    df[heating_col_name] = 0
                df[heating_col_name] += df[modeled_name]

            if 'Cooling' in modeled_name:
                if cooling_col_name not in df.columns:
                    # initialize the column
                    df[cooling_col_name] = 0
                df[cooling_col_name] += df[modeled_name]

        # sum up the heating and cooling columns for the total energy for each model_type
        if total_col_name not in df.columns:
            df[total_col_name] = 0
        if heating_col_name in df.columns:
            df[total_col_name] += df[heating_col_name]
        if cooling_col_name in df.columns:
            df[total_col_name] += df[cooling_col_name]

    # Validate each modeled response against the actual response
    errors = []
    for model_type, model_data in metadata.items():
        moniker = model_data['moniker']
        for response in model_data['responses']:
            modeled_name = "Modeled %s %s" % (moniker, response)

            grid = sns.lmplot(
                x=response,
                y=modeled_name,
                data=df,
                ci=None,
                palette="muted",
                height=8,
                scatter_kws={"s": 50, "alpha": 1}
            )
            fig = grid.fig
            # Lay out before saving; applied afterwards it cannot affect the written image.
            fig.tight_layout()
            fig.savefig('%s/fig_validation_%s_%s.png' % (image_save_dir, response, moniker))
            # Close (not just clear) so figures do not pile up across the loop.
            plt.close(fig)

            # Lag plot for each response variable (lag_plot draws on the current figure)
            lag_fig = plt.figure()
            lag_plot(df[response])
            lag_fig.savefig('%s/fig_lag_%s_%s.png' % (image_save_dir, moniker, response))
            plt.close(lag_fig)

            actual = df[response]
            modeled = df[modeled_name]
            residual = actual - modeled
            actual_mean = actual.mean()
            degrees_of_freedom = len(df) - 1

            nmbe = 100 * (residual.sum() / (degrees_of_freedom * actual_mean))
            cvrmse = (100 / actual_mean) * sqrt((residual ** 2).sum() / degrees_of_freedom)
            rmse = sqrt(mean_squared_error(actual, modeled))

            errors.append(
                OrderedDict(
                    [
                        ('response', response),
                        ('model_type', model_type),
                        ('rmse', rmse),
                        ('nmbe', nmbe),
                        ('cvrmse', cvrmse),
                    ]
                )
            )

    # Save data to image dir, because that is the only directory that I know of right now
    save_dict_to_csv(errors, "%s/statistics.csv" % image_save_dir)

    # Convert Energy to Watts
    df['Total HVAC Energy'] = df['Total HVAC Energy'] / energy_to_watts_divisor

    # One off plots
    melted_df = pd.melt(
        df[['SiteOutdoorAirDrybulbTemperature', 'Total HVAC Energy']],
        id_vars='SiteOutdoorAirDrybulbTemperature',
        var_name='Model',
        value_name='Energy'
    )
    melted_df['Dummy'] = 0
    filename = '%s/fig_validation_energy_actual.png' % image_save_dir
    validation_plot_energy_temp(melted_df, filename)

    all_columns = ['SiteOutdoorAirDrybulbTemperature', 'Total HVAC Energy']
    for model_data in metadata.values():
        moniker = model_data['moniker']
        model_total_col = 'Total HVAC Energy %s' % moniker

        # Convert to Watts
        df[model_total_col] = df[model_total_col] / energy_to_watts_divisor
        all_columns.append(model_total_col)

        melted_df = pd.melt(
            df[['SiteOutdoorAirDrybulbTemperature', 'Total HVAC Energy', model_total_col]],
            id_vars='SiteOutdoorAirDrybulbTemperature',
            var_name='Model',
            value_name='Energy'
        )
        melted_df['Dummy'] = 0
        filename = '%s/fig_validation_energy_combined_%s.png' % (image_save_dir, moniker)
        validation_plot_energy_temp(melted_df, filename)

    # Plot energy vs. outdoor temperature for all of the responses
    melted_df = pd.melt(
        df[all_columns],
        id_vars='SiteOutdoorAirDrybulbTemperature',
        var_name='Model',
        value_name='Energy'
    )
    melted_df['Dummy'] = 0
    filename = '%s/fig_validation_energy_combined_all.png' % image_save_dir
    validation_plot_energy_temp(melted_df, filename)

    # Gather, per response, the list of modeled column names. This depends only on metadata,
    # so it is built once rather than once per season.
    all_responses = {}
    for model_data in metadata.values():
        for response in model_data['responses']:
            modeled_name = "Modeled %s %s" % (model_data['moniker'], response)
            all_responses.setdefault(response, []).append(modeled_name)

    # Create a subselection of the data, and run some other plots
    sub_data = {
        'Swing': df[df["DateTime"].between("2009-03-01 01:00", "2009-03-10 00:00")],
        'Summer': df[df["DateTime"].between("2009-07-01 01:00", "2009-07-10 00:00")],
        'Winter': df[df["DateTime"].between("2009-01-15 01:00", "2009-01-25 00:00")],
    }

    for season, season_slice in sub_data.items():
        # Work on an explicit copy so the unit conversion never writes through a slice of df.
        season_df = season_slice.copy()
        # Columns already converted for this season. The same actual response is usually
        # shared by several models and must only be scaled once.
        converted_columns = set()

        # Plot each modeled response individually
        for model_data in metadata.values():
            moniker = model_data['moniker']
            for response in model_data['responses']:
                modeled_name = "Modeled %s %s" % (moniker, response)

                if 'Temperature' not in response:
                    # convert to watts
                    for column in (response, modeled_name):
                        if column not in converted_columns:
                            season_df[column] = season_df[column] / energy_to_watts_divisor
                            converted_columns.add(column)

                selected_columns = ['DateTime', response, modeled_name]
                melted_df = pd.melt(season_df[selected_columns],
                                    id_vars='DateTime',
                                    var_name='Variable',
                                    value_name='Value')
                melted_df['Dummy'] = 0
                filename = '%s/fig_validation_ts_%s_%s_%s.png' % (
                    image_save_dir, season, response, moniker)
                validation_plot_timeseries(melted_df, filename)

        # Now plot all the modeled responses together
        for response, models in all_responses.items():
            selected_columns = ['DateTime', response] + models
            melted_df = pd.melt(season_df[selected_columns],
                                id_vars='DateTime',
                                var_name='Variable',
                                value_name='Value')
            melted_df['Dummy'] = 0
            filename = '%s/fig_validation_ts_%s_%s_combined.png' % (
                image_save_dir, season, response)
            validation_plot_timeseries(melted_df, filename)
# Plot all the modeled timeseries results on a single plot