# © Crown-owned copyright 2024, Defence Science and Technology Laboratory UK
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional
import plotly.graph_objects as go
import polars as pl
import yaml
from plotly.graph_objs import Figure
from utils import _get_system_info
import primaite


PLOT_CONFIG = {
"size": {"auto_size": False, "width": 1500, "height": 900},
"template": "plotly_white",
"range_slider": False,
}


def _build_benchmark_results_dict(start_datetime: datetime, metadata_dict: Dict, config: Dict) -> dict:
"""
Constructs a dictionary aggregating benchmark results from multiple sessions.
:param start_datetime: The datetime when the benchmarking started.
:param metadata_dict: Dictionary containing metadata for each session.
:param config: Configuration settings used during the benchmarking.
:return: A dictionary containing aggregated data and metadata from the benchmarking sessions.
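
    Illustrative shape of ``metadata_dict`` (values are assumptions for documentation;
    session keys are 1-based integers, and only the keys read below are shown)::

        {
            1: {
                "total_episodes": 128,
                "total_time_steps": 16384,
                "total_s": 600.0,
                "s_per_step": 0.0366,
                "s_per_100_steps_10_nodes": 3.66,
                "total_reward_per_episode": {1: -10.0, 2: -8.5},
            },
        }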
"""
num_sessions = len(metadata_dict) # number of sessions
averaged_data = {
"start_timestamp": start_datetime.isoformat(),
"end_datetime": datetime.now().isoformat(),
"primaite_version": primaite.__version__,
"system_info": _get_system_info(),
"total_sessions": num_sessions,
"total_episodes": sum(d["total_episodes"] for d in metadata_dict.values()),
"total_time_steps": sum(d["total_time_steps"] for d in metadata_dict.values()),
"av_s_per_session": sum(d["total_s"] for d in metadata_dict.values()) / num_sessions,
"av_s_per_step": sum(d["s_per_step"] for d in metadata_dict.values()) / num_sessions,
"av_s_per_100_steps_10_nodes": sum(d["s_per_100_steps_10_nodes"] for d in metadata_dict.values())
/ num_sessions,
"combined_total_reward_per_episode": {},
"session_total_reward_per_episode": {k: v["total_reward_per_episode"] for k, v in metadata_dict.items()},
"config": config,
}
    # average each episode's reward across all sessions (session keys are 1-based)
    episodes = metadata_dict[1]["total_reward_per_episode"].keys()
for episode in episodes:
combined_av_reward = (
sum(metadata_dict[k]["total_reward_per_episode"][episode] for k in metadata_dict.keys()) / num_sessions
)
averaged_data["combined_total_reward_per_episode"][episode] = combined_av_reward
return averaged_data


def _get_df_from_episode_av_reward_dict(data: Dict) -> pl.DataFrame:
"""
Converts a dictionary of episode average rewards into a Polars DataFrame.
:param data: Dictionary with episodes as keys and average rewards as values.
:return: Polars DataFrame with episodes and average rewards, including a rolling average.
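
    Example (illustrative values; with polars' default ``min_periods``, the rolling
    average is null until 25 episodes are available)::

        df = _get_df_from_episode_av_reward_dict({1: -10.0, 2: -8.5})
        # df has columns: episode, av_reward, rolling_av_reward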
"""
    frame_data = {"episode": list(data.keys()), "av_reward": list(data.values())}
    return pl.from_dict(frame_data).with_columns(
        rolling_av_reward=pl.col("av_reward").rolling_mean(window_size=25)
    )


def _plot_benchmark_metadata(
benchmark_metadata_dict: Dict,
title: Optional[str] = None,
subtitle: Optional[str] = None,
) -> Figure:
"""
Plots benchmark metadata as a line graph using Plotly.
:param benchmark_metadata_dict: Dictionary containing the total reward per episode and session.
:param title: Optional title for the graph.
:param subtitle: Optional subtitle for the graph.
:return: Plotly figure object representing the benchmark metadata plot.
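
    Usage sketch (illustrative; assumes ``meta`` was produced by
    ``_build_benchmark_results_dict``)::

        fig = _plot_benchmark_metadata(meta, title="PrimAITE v1.0.0 Learning Benchmark")
        fig.write_image("learning_benchmark.png")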
"""
    if title:
        if subtitle:
            title = f"{title} <br><sup>{subtitle}</sup>"
    elif subtitle:
        title = subtitle
layout = go.Layout(
autosize=PLOT_CONFIG["size"]["auto_size"],
width=PLOT_CONFIG["size"]["width"],
height=PLOT_CONFIG["size"]["height"],
)
# Create the line graph with a colored line
fig = go.Figure(layout=layout)
fig.update_layout(template=PLOT_CONFIG["template"])
for session, av_reward_dict in benchmark_metadata_dict["session_total_reward_per_episode"].items():
df = _get_df_from_episode_av_reward_dict(av_reward_dict)
fig.add_trace(
go.Scatter(
x=df["episode"],
y=df["av_reward"],
mode="lines",
name=f"Session {session}",
opacity=0.25,
line={"color": "#a6a6a6"},
)
)
df = _get_df_from_episode_av_reward_dict(benchmark_metadata_dict["combined_total_reward_per_episode"])
fig.add_trace(
go.Scatter(
x=df["episode"], y=df["av_reward"], mode="lines", name="Combined Session Av", line={"color": "#FF0000"}
)
)
fig.add_trace(
go.Scatter(
x=df["episode"],
y=df["rolling_av_reward"],
mode="lines",
name="Rolling Av (Combined Session Av)",
line={"color": "#4CBB17"},
)
)
# Set the layout of the graph
fig.update_layout(
xaxis={
"title": "Episode",
"type": "linear",
},
yaxis={"title": "Total Reward"},
title=title,
)
return fig


def _plot_all_benchmarks_combined_session_av(results_directory: Path) -> Figure:
"""
Plot the Benchmark results for each released version of PrimAITE.
Does this by iterating over the ``benchmark/results`` directory and
extracting the benchmark metadata json for each version that has been
benchmarked. The combined_total_reward_per_episode is extracted from each,
converted into a polars dataframe, and plotted as a scatter line in plotly.
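
    Expected layout of ``results_directory`` (illustrative version numbers)::

        results/
            v1.0.0/v1.0.0_benchmark_metadata.json
            v1.0.1/v1.0.1_benchmark_metadata.json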
"""
major_v = primaite.__version__.split(".")[0]
title = f"Learning Benchmark of Minor and Bugfix Releases for Major Version {major_v}"
subtitle = "Rolling Av (Combined Session Av)"
    # title and subtitle are both set above, so compose them directly
    title = f"{title} <br><sup>{subtitle}</sup>"
layout = go.Layout(
autosize=PLOT_CONFIG["size"]["auto_size"],
width=PLOT_CONFIG["size"]["width"],
height=PLOT_CONFIG["size"]["height"],
)
# Create the line graph with a colored line
fig = go.Figure(layout=layout)
fig.update_layout(template=PLOT_CONFIG["template"])
    for version_dir in results_directory.iterdir():
        if not version_dir.is_dir():
            continue
        metadata_file = version_dir / f"{version_dir.name}_benchmark_metadata.json"
        # skip directories that don't contain benchmark metadata rather than raising
        if not metadata_file.exists():
            continue
        with open(metadata_file, "r") as file:
            metadata_dict = json.load(file)
        df = _get_df_from_episode_av_reward_dict(metadata_dict["combined_total_reward_per_episode"])
        fig.add_trace(go.Scatter(x=df["episode"], y=df["rolling_av_reward"], mode="lines", name=version_dir.name))
# Set the layout of the graph
fig.update_layout(
xaxis={
"title": "Episode",
"type": "linear",
},
yaxis={"title": "Total Reward"},
title=title,
)
fig["data"][0]["showlegend"] = True
return fig


def _get_performance_benchmark_for_all_version_dict(results_directory: Path) -> Dict[str, float]:
"""
Gathers performance benchmarks for all versions of the software stored in a specified directory.
This function iterates through each directory within the specified results directory,
extracts the av_s_per_100_steps_10_nodes from the benchmark_metadata.json files, and aggregates it into a
dictionary.
:param results_directory: The directory containing subdirectories for each version's benchmark data.
:return: A dictionary with version numbers as keys and their corresponding average performance benchmark
(average time per 100 steps on 10 nodes) as values.
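
    Illustrative return value::

        {"1.0.0": 3.92, "1.0.1": 3.70}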
"""
performance_benchmark_dict = {}
    for version_dir in results_directory.iterdir():
        if not version_dir.is_dir():
            continue
        metadata_file = version_dir / f"{version_dir.name}_benchmark_metadata.json"
        # skip directories that don't contain benchmark metadata rather than raising
        if not metadata_file.exists():
            continue
        with open(metadata_file, "r") as file:
            metadata_dict = json.load(file)
        version = metadata_dict["primaite_version"]
        performance_benchmark_dict[version] = metadata_dict["av_s_per_100_steps_10_nodes"]
return performance_benchmark_dict


def _plot_av_s_per_100_steps_10_nodes(
version_times_dict: Dict[str, float],
) -> Figure:
"""
Creates a bar chart visualising the performance of each version of PrimAITE.
Performance is based on the average training time per 100 steps on 10 nodes.
:param version_times_dict: A dictionary with software versions as keys and average times as values.
:return: A Plotly figure object representing the bar chart of the performance metrics.
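
    Usage sketch (illustrative path)::

        times = _get_performance_benchmark_for_all_version_dict(Path("benchmark/results"))
        fig = _plot_av_s_per_100_steps_10_nodes(times)
        fig.write_image("performance_benchmark.png")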
"""
major_v = primaite.__version__.split(".")[0]
title = f"Performance of Minor and Bugfix Releases for Major Version {major_v}"
subtitle = "Average Training Time per 100 Steps on 10 Nodes "
title = f"{title} <br><sub>{subtitle}</sub>"
layout = go.Layout(
autosize=PLOT_CONFIG["size"]["auto_size"],
width=PLOT_CONFIG["size"]["width"],
height=PLOT_CONFIG["size"]["height"],
)
fig = go.Figure(layout=layout)
fig.update_layout(template=PLOT_CONFIG["template"])
    # NOTE: lexicographic string sort; fine while each version component is single-digit
    versions = sorted(version_times_dict)
times = [version_times_dict[version] for version in versions]
fig.add_trace(
go.Bar(
x=versions,
y=times,
text=times,
textposition="auto",
)
)
fig.update_layout(
xaxis_title="PrimAITE Version",
yaxis_title="Avg Time per 100 Steps on 10 Nodes (seconds)",
title=title,
)
return fig


def build_benchmark_md_report(
benchmark_start_time: datetime, session_metadata: Dict, config_path: Path, results_root_path: Path
) -> None:
"""
Generates a Markdown report for a benchmarking session, documenting performance metrics and graphs.
This function orchestrates the creation of several graphs depicting various performance benchmarks and aggregates
them into a markdown document that includes comprehensive system and benchmark information.
:param benchmark_start_time: The datetime object representing when the benchmarking process was initiated.
:param session_metadata: A dictionary containing metadata for each benchmarking session.
:param config_path: A pathlib.Path object pointing to the configuration file used for the benchmark sessions.
:param results_root_path: A pathlib.Path object pointing to the directory where the results and graphs should be
saved.
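
    Usage sketch (illustrative paths; ``session_metadata`` comes from the benchmark runner)::

        build_benchmark_md_report(
            benchmark_start_time=datetime.now(),
            session_metadata=session_metadata,
            config_path=Path("benchmark/config.yaml"),
            results_root_path=Path("benchmark/results"),
        )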
"""
# generate report folder
v_str = f"v{primaite.__version__}"
version_result_dir = results_root_path / v_str
version_result_dir.mkdir(exist_ok=True, parents=True)
# load the config file as dict
with open(config_path, "r") as f:
cfg_data = yaml.safe_load(f)
# generate the benchmark metadata dict
benchmark_metadata_dict = _build_benchmark_results_dict(
start_datetime=benchmark_start_time, metadata_dict=session_metadata, config=cfg_data
)
major_v = primaite.__version__.split(".")[0]
with open(version_result_dir / f"{v_str}_benchmark_metadata.json", "w") as file:
json.dump(benchmark_metadata_dict, file, indent=4)
title = f"PrimAITE v{primaite.__version__.strip()} Learning Benchmark"
fig = _plot_benchmark_metadata(benchmark_metadata_dict, title=title)
this_version_plot_path = version_result_dir / f"{title}.png"
fig.write_image(this_version_plot_path)
fig = _plot_all_benchmarks_combined_session_av(results_directory=results_root_path)
filename = f"PrimAITE Learning Benchmark of Minor and Bugfix Releases for Major Version {major_v}.png"
all_version_plot_path = version_result_dir / filename
fig.write_image(all_version_plot_path)
performance_benchmark_dict = _get_performance_benchmark_for_all_version_dict(results_directory=results_root_path)
fig = _plot_av_s_per_100_steps_10_nodes(performance_benchmark_dict)
filename = f"PrimAITE Performance of Minor and Bugfix Releases for Major Version {major_v}.png"
performance_benchmark_plot_path = version_result_dir / filename
fig.write_image(performance_benchmark_plot_path)
data = benchmark_metadata_dict
primaite_version = data["primaite_version"]
with open(version_result_dir / f"PrimAITE v{primaite_version} Benchmark Report.md", "w") as file:
# Title
file.write(f"# PrimAITE v{primaite_version} Learning Benchmark\n")
file.write("## PrimAITE Dev Team\n")
file.write(f"### {datetime.now().date()}\n")
file.write("\n---\n")
sessions = data["total_sessions"]
episodes = session_metadata[1]["total_episodes"] - 1
steps = data["config"]["game"]["max_episode_length"]
# Body
file.write("## 1 Introduction\n")
file.write(
f"PrimAITE v{primaite_version} was benchmarked automatically upon release. Learning rate metrics "
f"were captured to be referenced during system-level testing and user acceptance testing (UAT).\n"
)
        file.write(
            f"The benchmarking process consists of running {sessions} training sessions using the same "
            f"config file. Each session trains an agent for {episodes} episodes, "
            f"with each episode consisting of {steps} steps.\n"
        )
        file.write(
            f"The total reward per episode from each session is captured. This is then used to calculate an "
            f"average total reward per episode across the {sessions} individual sessions for smoothing. "
            f"Finally, a 25-window rolling average of the combined average total reward per episode is "
            f"calculated for further smoothing.\n"
        )
file.write("## 2 System Information\n")
i = 1
file.write(f"### 2.{i} Python\n")
file.write(f"**Version:** {sys.version}\n")
        for section, section_data in data["system_info"].items():
            # only number sections that are actually written, so headings stay sequential
            if section_data:
                i += 1
                file.write(f"### 2.{i} {section}\n")
                if isinstance(section_data, dict):
                    for key, value in section_data.items():
                        file.write(f"- **{key}:** {value}\n")
headers_map = {
"total_sessions": "Total Sessions",
"total_episodes": "Total Episodes",
"total_time_steps": "Total Steps",
"av_s_per_session": "Av Session Duration (s)",
"av_s_per_step": "Av Step Duration (s)",
"av_s_per_100_steps_10_nodes": "Av Duration per 100 Steps per 10 Nodes (s)",
}
file.write("## 3 Stats\n")
for section, header in headers_map.items():
if section.startswith("av_"):
file.write(f"- **{header}:** {data[section]:.4f}\n")
else:
file.write(f"- **{header}:** {data[section]}\n")
file.write("## 4 Graphs\n")
file.write(f"### 4.1 v{primaite_version} Learning Benchmark Plot\n")
file.write(f"![PrimAITE {primaite_version} Learning Benchmark Plot]({this_version_plot_path.name})\n")
file.write(f"### 4.2 Learning Benchmark of Minor and Bugfix Releases for Major Version {major_v}\n")
file.write(
f"![Learning Benchmark of Minor and Bugfix Releases for Major Version {major_v}]"
f"({all_version_plot_path.name})\n"
)
file.write(f"### 4.3 Performance of Minor and Bugfix Releases for Major Version {major_v}\n")
file.write(
f"![Performance of Minor and Bugfix Releases for Major Version {major_v}]"
f"({performance_benchmark_plot_path.name})\n"
)