# © Crown-owned copyright 2024, Defence Science and Technology Laboratory UK import json import sys from datetime import datetime from pathlib import Path from typing import Dict, Optional import plotly.graph_objects as go import polars as pl import yaml from plotly.graph_objs import Figure from utils import _get_system_info import primaite PLOT_CONFIG = { "size": {"auto_size": False, "width": 1500, "height": 900}, "template": "plotly_white", "range_slider": False, } def _build_benchmark_results_dict(start_datetime: datetime, metadata_dict: Dict, config: Dict) -> dict: """ Constructs a dictionary aggregating benchmark results from multiple sessions. :param start_datetime: The datetime when the benchmarking started. :param metadata_dict: Dictionary containing metadata for each session. :param config: Configuration settings used during the benchmarking. :return: A dictionary containing aggregated data and metadata from the benchmarking sessions. """ num_sessions = len(metadata_dict) # number of sessions averaged_data = { "start_timestamp": start_datetime.isoformat(), "end_datetime": datetime.now().isoformat(), "primaite_version": primaite.__version__, "system_info": _get_system_info(), "total_sessions": num_sessions, "total_episodes": sum(d["total_episodes"] for d in metadata_dict.values()), "total_time_steps": sum(d["total_time_steps"] for d in metadata_dict.values()), "av_s_per_session": sum(d["total_s"] for d in metadata_dict.values()) / num_sessions, "av_s_per_step": sum(d["s_per_step"] for d in metadata_dict.values()) / num_sessions, "av_s_per_100_steps_10_nodes": sum(d["s_per_100_steps_10_nodes"] for d in metadata_dict.values()) / num_sessions, "combined_total_reward_per_episode": {}, "session_total_reward_per_episode": {k: v["total_reward_per_episode"] for k, v in metadata_dict.items()}, "config": config, } # find the average of each episode across all sessions episodes = metadata_dict[1]["total_reward_per_episode"].keys() for episode in episodes: combined_av_reward = ( sum(metadata_dict[k]["total_reward_per_episode"][episode] for k in metadata_dict.keys()) / num_sessions ) averaged_data["combined_total_reward_per_episode"][episode] = combined_av_reward return averaged_data def _get_df_from_episode_av_reward_dict(data: Dict) -> pl.DataFrame: """ Converts a dictionary of episode average rewards into a Polars DataFrame. :param data: Dictionary with episodes as keys and average rewards as values. :return: Polars DataFrame with episodes and average rewards, including a rolling average. """ data: Dict = {"episode": data.keys(), "av_reward": data.values()} return ( pl.from_dict(data) .with_columns(rolling_mean=pl.col("av_reward").rolling_mean(window_size=25)) .rename({"rolling_mean": "rolling_av_reward"}) ) def _plot_benchmark_metadata( benchmark_metadata_dict: Dict, title: Optional[str] = None, subtitle: Optional[str] = None, ) -> Figure: """ Plots benchmark metadata as a line graph using Plotly. :param benchmark_metadata_dict: Dictionary containing the total reward per episode and session. :param title: Optional title for the graph. :param subtitle: Optional subtitle for the graph. :return: Plotly figure object representing the benchmark metadata plot. """ if title: if subtitle: title = f"{title}
{subtitle}" else: if subtitle: title = subtitle layout = go.Layout( autosize=PLOT_CONFIG["size"]["auto_size"], width=PLOT_CONFIG["size"]["width"], height=PLOT_CONFIG["size"]["height"], ) # Create the line graph with a colored line fig = go.Figure(layout=layout) fig.update_layout(template=PLOT_CONFIG["template"]) for session, av_reward_dict in benchmark_metadata_dict["session_total_reward_per_episode"].items(): df = _get_df_from_episode_av_reward_dict(av_reward_dict) fig.add_trace( go.Scatter( x=df["episode"], y=df["av_reward"], mode="lines", name=f"Session {session}", opacity=0.25, line={"color": "#a6a6a6"}, ) ) df = _get_df_from_episode_av_reward_dict(benchmark_metadata_dict["combined_total_reward_per_episode"]) fig.add_trace( go.Scatter( x=df["episode"], y=df["av_reward"], mode="lines", name="Combined Session Av", line={"color": "#FF0000"} ) ) fig.add_trace( go.Scatter( x=df["episode"], y=df["rolling_av_reward"], mode="lines", name="Rolling Av (Combined Session Av)", line={"color": "#4CBB17"}, ) ) # Set the layout of the graph fig.update_layout( xaxis={ "title": "Episode", "type": "linear", }, yaxis={"title": "Total Reward"}, title=title, ) return fig def _plot_all_benchmarks_combined_session_av(results_directory: Path) -> Figure: """ Plot the Benchmark results for each released version of PrimAITE. Does this by iterating over the ``benchmark/results`` directory and extracting the benchmark metadata json for each version that has been benchmarked. The combined_total_reward_per_episode is extracted from each, converted into a polars dataframe, and plotted as a scatter line in plotly. """ major_v = primaite.__version__.split(".")[0] title = f"Learning Benchmark of Minor and Bugfix Releases for Major Version {major_v}" subtitle = "Rolling Av (Combined Session Av)" if title: if subtitle: title = f"{title}
{subtitle}" else: if subtitle: title = subtitle layout = go.Layout( autosize=PLOT_CONFIG["size"]["auto_size"], width=PLOT_CONFIG["size"]["width"], height=PLOT_CONFIG["size"]["height"], ) # Create the line graph with a colored line fig = go.Figure(layout=layout) fig.update_layout(template=PLOT_CONFIG["template"]) for dir in results_directory.iterdir(): if dir.is_dir(): metadata_file = dir / f"{dir.name}_benchmark_metadata.json" with open(metadata_file, "r") as file: metadata_dict = json.load(file) df = _get_df_from_episode_av_reward_dict(metadata_dict["combined_total_reward_per_episode"]) fig.add_trace(go.Scatter(x=df["episode"], y=df["rolling_av_reward"], mode="lines", name=dir.name)) # Set the layout of the graph fig.update_layout( xaxis={ "title": "Episode", "type": "linear", }, yaxis={"title": "Total Reward"}, title=title, ) fig["data"][0]["showlegend"] = True return fig def _get_performance_benchmark_for_all_version_dict(results_directory: Path) -> Dict[str, float]: """ Gathers performance benchmarks for all versions of the software stored in a specified directory. This function iterates through each directory within the specified results directory, extracts the av_s_per_100_steps_10_nodes from the benchmark_metadata.json files, and aggregates it into a dictionary. :param results_directory: The directory containing subdirectories for each version's benchmark data. :return: A dictionary with version numbers as keys and their corresponding average performance benchmark (average time per 100 steps on 10 nodes) as values. """ performance_benchmark_dict = {} for dir in results_directory.iterdir(): if dir.is_dir(): metadata_file = dir / f"{dir.name}_benchmark_metadata.json" with open(metadata_file, "r") as file: metadata_dict = json.load(file) version = metadata_dict["primaite_version"] performance_benchmark_dict[version] = metadata_dict["av_s_per_100_steps_10_nodes"] return performance_benchmark_dict def _plot_av_s_per_100_steps_10_nodes( version_times_dict: Dict[str, float], ) -> Figure: """ Creates a bar chart visualising the performance of each version of PrimAITE. Performance is based on the average training time per 100 steps on 10 nodes. :param version_times_dict: A dictionary with software versions as keys and average times as values. :return: A Plotly figure object representing the bar chart of the performance metrics. """ major_v = primaite.__version__.split(".")[0] title = f"Performance of Minor and Bugfix Releases for Major Version {major_v}" subtitle = "Average Training Time per 100 Steps on 10 Nodes " title = f"{title}
{subtitle}" layout = go.Layout( autosize=PLOT_CONFIG["size"]["auto_size"], width=PLOT_CONFIG["size"]["width"], height=PLOT_CONFIG["size"]["height"], ) fig = go.Figure(layout=layout) fig.update_layout(template=PLOT_CONFIG["template"]) versions = sorted(list(version_times_dict.keys())) times = [version_times_dict[version] for version in versions] fig.add_trace( go.Bar( x=versions, y=times, text=times, textposition="auto", ) ) fig.update_layout( xaxis_title="PrimAITE Version", yaxis_title="Avg Time per 100 Steps on 10 Nodes (seconds)", title=title, ) return fig def build_benchmark_md_report( benchmark_start_time: datetime, session_metadata: Dict, config_path: Path, results_root_path: Path ) -> None: """ Generates a Markdown report for a benchmarking session, documenting performance metrics and graphs. This function orchestrates the creation of several graphs depicting various performance benchmarks and aggregates them into a markdown document that includes comprehensive system and benchmark information. :param benchmark_start_time: The datetime object representing when the benchmarking process was initiated. :param session_metadata: A dictionary containing metadata for each benchmarking session. :param config_path: A pathlib.Path object pointing to the configuration file used for the benchmark sessions. :param results_root_path: A pathlib.Path object pointing to the directory where the results and graphs should be saved. """ # generate report folder v_str = f"v{primaite.__version__}" version_result_dir = results_root_path / v_str version_result_dir.mkdir(exist_ok=True, parents=True) # load the config file as dict with open(config_path, "r") as f: cfg_data = yaml.safe_load(f) # generate the benchmark metadata dict benchmark_metadata_dict = _build_benchmark_results_dict( start_datetime=benchmark_start_time, metadata_dict=session_metadata, config=cfg_data ) major_v = primaite.__version__.split(".")[0] with open(version_result_dir / f"{v_str}_benchmark_metadata.json", "w") as file: json.dump(benchmark_metadata_dict, file, indent=4) title = f"PrimAITE v{primaite.__version__.strip()} Learning Benchmark" fig = _plot_benchmark_metadata(benchmark_metadata_dict, title=title) this_version_plot_path = version_result_dir / f"{title}.png" fig.write_image(this_version_plot_path) fig = _plot_all_benchmarks_combined_session_av(results_directory=results_root_path) filename = f"PrimAITE Learning Benchmark of Minor and Bugfix Releases for Major Version {major_v}.png" all_version_plot_path = version_result_dir / filename fig.write_image(all_version_plot_path) performance_benchmark_dict = _get_performance_benchmark_for_all_version_dict(results_directory=results_root_path) fig = _plot_av_s_per_100_steps_10_nodes(performance_benchmark_dict) filename = f"PrimAITE Performance of Minor and Bugfix Releases for Major Version {major_v}.png" performance_benchmark_plot_path = version_result_dir / filename fig.write_image(performance_benchmark_plot_path) data = benchmark_metadata_dict primaite_version = data["primaite_version"] with open(version_result_dir / f"PrimAITE v{primaite_version} Benchmark Report.md", "w") as file: # Title file.write(f"# PrimAITE v{primaite_version} Learning Benchmark\n") file.write("## PrimAITE Dev Team\n") file.write(f"### {datetime.now().date()}\n") file.write("\n---\n") sessions = data["total_sessions"] episodes = session_metadata[1]["total_episodes"] - 1 steps = data["config"]["game"]["max_episode_length"] # Body file.write("## 1 Introduction\n") file.write( f"PrimAITE v{primaite_version} was benchmarked automatically upon release. Learning rate metrics " f"were captured to be referenced during system-level testing and user acceptance testing (UAT).\n" ) file.write( f"The benchmarking process consists of running {sessions} training session using the same " f"config file. Each session trains an agent for {episodes} episodes, " f"with each episode consisting of {steps} steps.\n" ) file.write( f"The total reward per episode from each session is captured. This is then used to calculate an " f"caverage total reward per episode from the {sessions} individual sessions for smoothing. " f"Finally, a 25-widow rolling average of the average total reward per session is calculated for " f"further smoothing.\n" ) file.write("## 2 System Information\n") i = 1 file.write(f"### 2.{i} Python\n") file.write(f"**Version:** {sys.version}\n") for section, section_data in data["system_info"].items(): i += 1 if section_data: file.write(f"### 2.{i} {section}\n") if isinstance(section_data, dict): for key, value in section_data.items(): file.write(f"- **{key}:** {value}\n") headers_map = { "total_sessions": "Total Sessions", "total_episodes": "Total Episodes", "total_time_steps": "Total Steps", "av_s_per_session": "Av Session Duration (s)", "av_s_per_step": "Av Step Duration (s)", "av_s_per_100_steps_10_nodes": "Av Duration per 100 Steps per 10 Nodes (s)", } file.write("## 3 Stats\n") for section, header in headers_map.items(): if section.startswith("av_"): file.write(f"- **{header}:** {data[section]:.4f}\n") else: file.write(f"- **{header}:** {data[section]}\n") file.write("## 4 Graphs\n") file.write(f"### 4.1 v{primaite_version} Learning Benchmark Plot\n") file.write(f"![PrimAITE {primaite_version} Learning Benchmark Plot]({this_version_plot_path.name})\n") file.write(f"### 4.2 Learning Benchmark of Minor and Bugfix Releases for Major Version {major_v}\n") file.write( f"![Learning Benchmark of Minor and Bugfix Releases for Major Version {major_v}]" f"({all_version_plot_path.name})\n" ) file.write(f"### 4.3 Performance of Minor and Bugfix Releases for Major Version {major_v}\n") file.write( f"![Performance of Minor and Bugfix Releases for Major Version {major_v}]" f"({performance_benchmark_plot_path.name})\n" )