Retrospectively benchmarked v3.0.0 using the new benchmark process

This commit is contained in:
Chris McCarthy
2024-07-20 15:05:56 +01:00
parent 3eacad200e
commit 213e301e58
8 changed files with 329 additions and 99 deletions

View File

@@ -0,0 +1,108 @@
# Azure DevOps pipeline: automated PrimAITE learning/performance benchmark.
# NOTE(review): YAML indentation appears to have been stripped by the diff
# rendering — restore standard Azure Pipelines nesting before using this file.
trigger:
- release/*
# Scheduled weekday run (presumably against dev, per the branch include below).
schedules:
- cron: "0 2 * * 1-5" # Run at 2 AM every weekday
displayName: "Weekday Schedule"
branches:
include:
- 'refs/heads/dev'
# Declared empty here; populated at runtime by the 'Set Version Variables' step.
variables:
VERSION: ''
MAJOR_VERSION: ''
jobs:
- job: PrimAITE_Benchmark
timeoutInMinutes: 360 # 6-hour maximum
pool:
name: 'Imaginary Yak Pool'
workspace:
clean: all # start each run from a clean working directory
steps:
- checkout: self
persistCredentials: true # required by the git push step at the end
- script: |
python3.10 -m venv venv
displayName: 'Create venv'
# On dev builds, suffix the version with "+dev.<YYYYMMDD>" so dev-snapshot
# benchmark results are distinguishable from release results.
- script: |
VERSION=$(cat src/primaite/VERSION | tr -d '\n')
if [[ "$(Build.SourceBranch)" == "refs/heads/dev" ]]; then
DATE=$(date +%Y%m%d)
echo "${VERSION}+dev.${DATE}" > src/primaite/VERSION
fi
displayName: 'Update VERSION file for Dev Benchmark'
# Export VERSION / MAJOR_VERSION as pipeline variables via the
# task.setvariable logging command, for use by later steps.
- script: |
VERSION=$(cat src/primaite/VERSION | tr -d '\n')
MAJOR_VERSION=$(echo $VERSION | cut -d. -f1)
echo "##vso[task.setvariable variable=VERSION]$VERSION"
echo "##vso[task.setvariable variable=MAJOR_VERSION]$MAJOR_VERSION"
displayName: 'Set Version Variables'
- script: |
source venv/bin/activate
pip install --upgrade pip
pip install -e .[dev,rl]
primaite setup
displayName: 'Install Dependencies'
# set -e so a benchmark failure fails the step instead of being masked
# by the later commands in the script.
- script: |
set -e
source venv/bin/activate
cd benchmark
python primaite_benchmark.py
cd ..
displayName: 'Run Benchmarking Script'
- script: |
tar czf primaite_v$(VERSION)_benchmark.tar.gz benchmark/results/v$(MAJOR_VERSION)/v$(VERSION)
displayName: 'Prepare Artifacts for Publishing'
- task: PublishPipelineArtifact@1
inputs:
targetPath: primaite_v$(VERSION)_benchmark.tar.gz
artifactName: 'benchmark-zip-output'
publishLocation: 'pipeline'
displayName: 'Publish Benchmark Output zip as Artifact'
# Git identity for the automated results commit (release branches only,
# per the condition below).
- script: |
git config --global user.email "oss@dstl.gov.uk"
git config --global user.name "Defence Science and Technology Laboratory UK"
workingDirectory: $(System.DefaultWorkingDirectory)
displayName: 'Configure Git'
condition: and(succeeded(), startsWith(variables['Build.SourceBranch'], 'refs/heads/release'))
# Commit the generated benchmark results back to the originating release
# branch. "[skip ci]" in the commit message prevents a re-trigger loop.
# The stash push/pop carries the freshly generated results across the
# checkout from the detached-HEAD build state onto the named branch.
- script: |
echo "Fetching all branches..."
git fetch --all --prune
echo "Stashing files..."
git stash push -u
echo "Resolving branch name..."
# Extracting just the branch name from the full ref path
branch_name=$(echo "$(Build.SourceBranch)" | sed 's|refs/heads/||')
echo "Branch Name: $branch_name"
echo "Checking out branch $branch_name..."
git checkout $branch_name
echo "Popping stash..."
git stash pop
echo "Adding benchmark results..."
git add benchmark/results/v$(MAJOR_VERSION)/v$(VERSION)/*
echo "Committing changes..."
git commit -m "Automated benchmark output commit for version $(VERSION) [skip ci]"
echo "Pushing to remote..."
git push origin $branch_name
displayName: 'Commit and Push Benchmark Results'
workingDirectory: $(System.DefaultWorkingDirectory)
env:
GIT_CREDENTIALS: $(System.AccessToken)
condition: and(succeeded(), startsWith(variables['Build.SourceBranch'], 'refs/heads/release'))

View File

@@ -1,3 +1,4 @@
# © Crown-owned copyright 2024, Defence Science and Technology Laboratory UK
from typing import Any, Dict, Optional, Tuple
from gymnasium.core import ObsType

View File

@@ -1,11 +1,11 @@
# © Crown-owned copyright 2023, Defence Science and Technology Laboratory UK
# © Crown-owned copyright 2024, Defence Science and Technology Laboratory UK
import json
import shutil
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Final, Tuple
from report import build_benchmark_latex_report
from report import build_benchmark_md_report
from stable_baselines3 import PPO
import primaite
@@ -117,14 +117,14 @@ class BenchmarkSession:
def generate_learn_metadata_dict(self) -> Dict[str, Any]:
"""Metadata specific to the learning session."""
total_s, s_per_step, s_per_100_steps_10_nodes = self._learn_benchmark_durations()
self.gym_env.average_reward_per_episode.pop(0) # remove episode 0
self.gym_env.total_reward_per_episode.pop(0) # remove episode 0
return {
"total_episodes": self.gym_env.episode_counter,
"total_time_steps": self.gym_env.total_time_steps,
"total_s": total_s,
"s_per_step": s_per_step,
"s_per_100_steps_10_nodes": s_per_100_steps_10_nodes,
"av_reward_per_episode": self.gym_env.average_reward_per_episode,
"total_reward_per_episode": self.gym_env.total_reward_per_episode,
}
@@ -151,8 +151,8 @@ def _prepare_session_directory():
def run(
number_of_sessions: int = 2,
num_episodes: int = 5,
number_of_sessions: int = 5,
num_episodes: int = 1000,
episode_len: int = 128,
n_steps: int = 1280,
batch_size: int = 32,
@@ -188,7 +188,7 @@ def run(
with open(_SESSION_METADATA_ROOT / f"{i}.json", "r") as file:
session_metadata_dict[i] = json.load(file)
# generate report
build_benchmark_latex_report(
build_benchmark_md_report(
benchmark_start_time=benchmark_start_time,
session_metadata=session_metadata_dict,
config_path=data_manipulation_config_path(),

View File

@@ -1,4 +1,4 @@
# © Crown-owned copyright 2023, Defence Science and Technology Laboratory UK
# © Crown-owned copyright 2024, Defence Science and Technology Laboratory UK
import json
import sys
from datetime import datetime
@@ -9,10 +9,6 @@ import plotly.graph_objects as go
import polars as pl
import yaml
from plotly.graph_objs import Figure
from pylatex import Command, Document
from pylatex import Figure as LatexFigure
from pylatex import Section, Subsection, Tabular
from pylatex.utils import bold
from utils import _get_system_info
import primaite
@@ -21,10 +17,20 @@ PLOT_CONFIG = {
"size": {"auto_size": False, "width": 1500, "height": 900},
"template": "plotly_white",
"range_slider": False,
"av_s_per_100_steps_10_nodes_benchmark_threshold": 5,
"benchmark_line_color": "grey",
}
def _build_benchmark_results_dict(start_datetime: datetime, metadata_dict: Dict, config: Dict) -> dict:
"""
Constructs a dictionary aggregating benchmark results from multiple sessions.
:param start_datetime: The datetime when the benchmarking started.
:param metadata_dict: Dictionary containing metadata for each session.
:param config: Configuration settings used during the benchmarking.
:return: A dictionary containing aggregated data and metadata from the benchmarking sessions.
"""
num_sessions = len(metadata_dict) # number of sessions
averaged_data = {
@@ -39,24 +45,30 @@ def _build_benchmark_results_dict(start_datetime: datetime, metadata_dict: Dict,
"av_s_per_step": sum(d["s_per_step"] for d in metadata_dict.values()) / num_sessions,
"av_s_per_100_steps_10_nodes": sum(d["s_per_100_steps_10_nodes"] for d in metadata_dict.values())
/ num_sessions,
"combined_av_reward_per_episode": {},
"session_av_reward_per_episode": {k: v["av_reward_per_episode"] for k, v in metadata_dict.items()},
"combined_total_reward_per_episode": {},
"session_total_reward_per_episode": {k: v["total_reward_per_episode"] for k, v in metadata_dict.items()},
"config": config,
}
# find the average of each episode across all sessions
episodes = metadata_dict[1]["av_reward_per_episode"].keys()
episodes = metadata_dict[1]["total_reward_per_episode"].keys()
for episode in episodes:
combined_av_reward = (
sum(metadata_dict[k]["av_reward_per_episode"][episode] for k in metadata_dict.keys()) / num_sessions
sum(metadata_dict[k]["total_reward_per_episode"][episode] for k in metadata_dict.keys()) / num_sessions
)
averaged_data["combined_av_reward_per_episode"][episode] = combined_av_reward
averaged_data["combined_total_reward_per_episode"][episode] = combined_av_reward
return averaged_data
def _get_df_from_episode_av_reward_dict(data: Dict) -> pl.DataFrame:
"""
Converts a dictionary of episode average rewards into a Polars DataFrame.
:param data: Dictionary with episodes as keys and average rewards as values.
:return: Polars DataFrame with episodes and average rewards, including a rolling average.
"""
data: Dict = {"episode": data.keys(), "av_reward": data.values()}
return (
@@ -71,6 +83,14 @@ def _plot_benchmark_metadata(
title: Optional[str] = None,
subtitle: Optional[str] = None,
) -> Figure:
"""
Plots benchmark metadata as a line graph using Plotly.
:param benchmark_metadata_dict: Dictionary containing the total reward per episode and session.
:param title: Optional title for the graph.
:param subtitle: Optional subtitle for the graph.
:return: Plotly figure object representing the benchmark metadata plot.
"""
if title:
if subtitle:
title = f"{title} <br>{subtitle}</sup>"
@@ -87,7 +107,7 @@ def _plot_benchmark_metadata(
fig = go.Figure(layout=layout)
fig.update_layout(template=PLOT_CONFIG["template"])
for session, av_reward_dict in benchmark_metadata_dict["session_av_reward_per_episode"].items():
for session, av_reward_dict in benchmark_metadata_dict["session_total_reward_per_episode"].items():
df = _get_df_from_episode_av_reward_dict(av_reward_dict)
fig.add_trace(
go.Scatter(
@@ -100,7 +120,7 @@ def _plot_benchmark_metadata(
)
)
df = _get_df_from_episode_av_reward_dict(benchmark_metadata_dict["combined_av_reward_per_episode"])
df = _get_df_from_episode_av_reward_dict(benchmark_metadata_dict["combined_total_reward_per_episode"])
fig.add_trace(
go.Scatter(
x=df["episode"], y=df["av_reward"], mode="lines", name="Combined Session Av", line={"color": "#FF0000"}
@@ -136,11 +156,11 @@ def _plot_all_benchmarks_combined_session_av(results_directory: Path) -> Figure:
Does this by iterating over the ``benchmark/results`` directory and
extracting the benchmark metadata json for each version that has been
benchmarked. The combined_av_reward_per_episode is extracted from each,
benchmarked. The combined_total_reward_per_episode is extracted from each,
converted into a polars dataframe, and plotted as a scatter line in plotly.
"""
major_v = primaite.__version__.split(".")[0]
title = f"Learning Benchmarking of All Released Versions under Major v{major_v}.*.*"
title = f"Learning Benchmark of Minor and Bugfix Releases for Major Version {major_v}"
subtitle = "Rolling Av (Combined Session Av)"
if title:
if subtitle:
@@ -162,7 +182,7 @@ def _plot_all_benchmarks_combined_session_av(results_directory: Path) -> Figure:
metadata_file = dir / f"{dir.name}_benchmark_metadata.json"
with open(metadata_file, "r") as file:
metadata_dict = json.load(file)
df = _get_df_from_episode_av_reward_dict(metadata_dict["combined_av_reward_per_episode"])
df = _get_df_from_episode_av_reward_dict(metadata_dict["combined_total_reward_per_episode"])
fig.add_trace(go.Scatter(x=df["episode"], y=df["rolling_av_reward"], mode="lines", name=dir.name))
@@ -180,10 +200,118 @@ def _plot_all_benchmarks_combined_session_av(results_directory: Path) -> Figure:
return fig
def build_benchmark_latex_report(
def _get_performance_benchmark_for_all_version_dict(results_directory: Path) -> Dict[str, float]:
"""
Gathers performance benchmarks for all versions of the software stored in a specified directory.
This function iterates through each directory within the specified results directory,
extracts the av_s_per_100_steps_10_nodes from the benchmark_metadata.json files, and aggregates it into a
dictionary.
:param results_directory: The directory containing subdirectories for each version's benchmark data.
:return: A dictionary with version numbers as keys and their corresponding average performance benchmark
(average time per 100 steps on 10 nodes) as values.
"""
performance_benchmark_dict = {}
for dir in results_directory.iterdir():
if dir.is_dir():
metadata_file = dir / f"{dir.name}_benchmark_metadata.json"
with open(metadata_file, "r") as file:
metadata_dict = json.load(file)
version = metadata_dict["primaite_version"]
performance_benchmark_dict[version] = metadata_dict["av_s_per_100_steps_10_nodes"]
return performance_benchmark_dict
def _plot_av_s_per_100_steps_10_nodes(
version_times_dict: Dict[str, float],
) -> Figure:
"""
Creates a bar chart visualising the performance of each version of PrimAITE.
Performance is based on the average training time per 100 steps on 10 nodes. The function also includes a benchmark
line indicating the target maximum time.
Versions that perform under this time are marked in green, and those over are marked in red.
:param version_times_dict: A dictionary with software versions as keys and average times as values.
:return: A Plotly figure object representing the bar chart of the performance metrics.
"""
major_v = primaite.__version__.split(".")[0]
title = f"Performance of Minor and Bugfix Releases for Major Version {major_v}"
subtitle = (
f"Average Training Time per 100 Steps on 10 Nodes "
f"(target: <= {PLOT_CONFIG['av_s_per_100_steps_10_nodes_benchmark_threshold']} seconds)"
)
title = f"{title} <br><sub>{subtitle}</sub>"
layout = go.Layout(
autosize=PLOT_CONFIG["size"]["auto_size"],
width=PLOT_CONFIG["size"]["width"],
height=PLOT_CONFIG["size"]["height"],
)
fig = go.Figure(layout=layout)
fig.update_layout(template=PLOT_CONFIG["template"])
versions = sorted(list(version_times_dict.keys()))
times = [version_times_dict[version] for version in versions]
av_s_per_100_steps_10_nodes_benchmark_threshold = PLOT_CONFIG["av_s_per_100_steps_10_nodes_benchmark_threshold"]
benchmark_line_color = PLOT_CONFIG["benchmark_line_color"]
# Calculate the appropriate maximum y-axis value
max_y_axis_value = max(max(times), av_s_per_100_steps_10_nodes_benchmark_threshold) + 1
fig.add_trace(
go.Bar(
x=versions,
y=times,
marker_color=[
"green" if time < av_s_per_100_steps_10_nodes_benchmark_threshold else "red" for time in times
],
text=times,
textposition="auto",
)
)
# Add a horizontal line for the benchmark
fig.add_shape(
type="line",
x0=-0.5, # start slightly before the first bar
x1=len(versions) - 0.5, # end slightly after the last bar
y0=av_s_per_100_steps_10_nodes_benchmark_threshold,
y1=av_s_per_100_steps_10_nodes_benchmark_threshold,
line=dict(
color=benchmark_line_color,
width=2,
dash="dot",
),
)
fig.update_layout(
xaxis_title="PrimAITE Version",
yaxis_title="Avg Time per 100 Steps on 10 Nodes (seconds)",
yaxis=dict(range=[0, max_y_axis_value]),
title=title,
)
return fig
def build_benchmark_md_report(
benchmark_start_time: datetime, session_metadata: Dict, config_path: Path, results_root_path: Path
) -> None:
"""Generates a latex report of the benchmark run."""
"""
Generates a Markdown report for a benchmarking session, documenting performance metrics and graphs.
This function orchestrates the creation of several graphs depicting various performance benchmarks and aggregates
them into a markdown document that includes comprehensive system and benchmark information.
:param benchmark_start_time: The datetime object representing when the benchmarking process was initiated.
:param session_metadata: A dictionary containing metadata for each benchmarking session.
:param config_path: A pathlib.Path object pointing to the configuration file used for the benchmark sessions.
:param results_root_path: A pathlib.Path object pointing to the directory where the results and graphs should be
saved.
"""
# generate report folder
v_str = f"v{primaite.__version__}"
@@ -208,69 +336,61 @@ def build_benchmark_latex_report(
fig = _plot_all_benchmarks_combined_session_av(results_directory=results_root_path)
all_version_plot_path = results_root_path / "PrimAITE Versions Learning Benchmark.png"
filename = f"PrimAITE Learning Benchmark of Minor and Bugfix Releases for Major Version {major_v}.png"
all_version_plot_path = version_result_dir / filename
fig.write_image(all_version_plot_path)
geometry_options = {"tmargin": "2.5cm", "rmargin": "2.5cm", "bmargin": "2.5cm", "lmargin": "2.5cm"}
performance_benchmark_dict = _get_performance_benchmark_for_all_version_dict(results_directory=results_root_path)
fig = _plot_av_s_per_100_steps_10_nodes(performance_benchmark_dict)
filename = f"PrimAITE Performance of Minor and Bugfix Releases for Major Version {major_v}.png"
performance_benchmark_plot_path = version_result_dir / filename
fig.write_image(performance_benchmark_plot_path)
data = benchmark_metadata_dict
primaite_version = data["primaite_version"]
# Create a new document
doc = Document("report", geometry_options=geometry_options)
with open(version_result_dir / f"PrimAITE v{primaite_version} Benchmark Report.md", "w") as file:
# Title
doc.preamble.append(Command("title", f"PrimAITE {primaite_version} Learning Benchmark"))
doc.preamble.append(Command("author", "PrimAITE Dev Team"))
doc.preamble.append(Command("date", datetime.now().date()))
doc.append(Command("maketitle"))
file.write(f"# PrimAITE v{primaite_version} Learning Benchmark\n")
file.write("## PrimAITE Dev Team\n")
file.write(f"### {datetime.now().date()}\n")
file.write("\n---\n")
sessions = data["total_sessions"]
episodes = session_metadata[1]["total_episodes"] - 1
steps = data["config"]["game"]["max_episode_length"]
# Body
with doc.create(Section("Introduction")):
doc.append(
file.write("## 1 Introduction\n")
file.write(
f"PrimAITE v{primaite_version} was benchmarked automatically upon release. Learning rate metrics "
f"were captured to be referenced during system-level testing and user acceptance testing (UAT)."
f"were captured to be referenced during system-level testing and user acceptance testing (UAT).\n"
)
doc.append(
f"\nThe benchmarking process consists of running {sessions} training sessions using the same "
file.write(
f"The benchmarking process consists of running {sessions} training sessions using the same "
f"config file. Each session trains an agent for {episodes} episodes, "
f"with each episode consisting of {steps} steps."
f"with each episode consisting of {steps} steps.\n"
)
doc.append(
f"\nThe total reward per episode from each session is captured. This is then used to calculate an "
file.write(
f"The total reward per episode from each session is captured. This is then used to calculate an "
f"average total reward per episode from the {sessions} individual sessions for smoothing. "
f"Finally, a 25-window rolling average of the average total reward per session is calculated for "
f"further smoothing."
f"further smoothing.\n"
)
with doc.create(Section("System Information")):
with doc.create(Subsection("Python")):
with doc.create(Tabular("|l|l|")) as table:
table.add_hline()
table.add_row((bold("Version"), sys.version))
table.add_hline()
file.write("## 2 System Information\n")
i = 1
file.write(f"### 2.{i} Python\n")
file.write(f"**Version:** {sys.version}\n")
for section, section_data in data["system_info"].items():
i += 1
if section_data:
with doc.create(Subsection(section)):
file.write(f"### 2.{i} {section}\n")
if isinstance(section_data, dict):
with doc.create(Tabular("|l|l|")) as table:
table.add_hline()
for key, value in section_data.items():
table.add_row((bold(key), value))
table.add_hline()
elif isinstance(section_data, list):
headers = section_data[0].keys()
tabs_str = "|".join(["l" for _ in range(len(headers))])
tabs_str = f"|{tabs_str}|"
with doc.create(Tabular(tabs_str)) as table:
table.add_hline()
table.add_row([bold(h) for h in headers])
table.add_hline()
for item in section_data:
table.add_row(item.values())
table.add_hline()
file.write(f"- **{key}:** {value}\n")
headers_map = {
"total_sessions": "Total Sessions",
@@ -280,26 +400,27 @@ def build_benchmark_latex_report(
"av_s_per_step": "Av Step Duration (s)",
"av_s_per_100_steps_10_nodes": "Av Duration per 100 Steps per 10 Nodes (s)",
}
with doc.create(Section("Stats")):
with doc.create(Subsection("Benchmark Results")):
with doc.create(Tabular("|l|l|")) as table:
table.add_hline()
file.write("## 3 Stats\n")
for section, header in headers_map.items():
if section.startswith("av_"):
table.add_row((bold(header), f"{data[section]:.4f}"))
file.write(f"- **{header}:** {data[section]:.4f}\n")
else:
table.add_row((bold(header), data[section]))
table.add_hline()
file.write(f"- **{header}:** {data[section]}\n")
with doc.create(Section("Graphs")):
with doc.create(Subsection(f"v{primaite_version} Learning Benchmark Plot")):
with doc.create(LatexFigure(position="h!")) as pic:
pic.add_image(str(this_version_plot_path))
pic.add_caption(f"PrimAITE {primaite_version} Learning Benchmark Plot")
file.write("## 4 Graphs\n")
with doc.create(Subsection(f"Learning Benchmarking of All Released Versions under Major v{major_v}.*.*")):
with doc.create(LatexFigure(position="h!")) as pic:
pic.add_image(str(all_version_plot_path))
pic.add_caption(f"Learning Benchmarking of All Released Versions under Major v{major_v}.*.*")
file.write(f"### 4.1 v{primaite_version} Learning Benchmark Plot\n")
file.write(f"![PrimAITE {primaite_version} Learning Benchmark Plot]({this_version_plot_path.name})\n")
doc.generate_pdf(str(this_version_plot_path).replace(".png", ""), clean_tex=True)
file.write(f"### 4.2 Learning Benchmark of Minor and Bugfix Releases for Major Version {major_v}\n")
file.write(
f"![Learning Benchmark of Minor and Bugfix Releases for Major Version {major_v}]"
f"({all_version_plot_path.name})\n"
)
file.write(f"### 4.3 Performance of Minor and Bugfix Releases for Major Version {major_v}\n")
file.write(
f"![Performance of Minor and Bugfix Releases for Major Version {major_v}]"
f"({performance_benchmark_plot_path.name})\n"
)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 91 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 295 KiB

View File

@@ -1,4 +1,4 @@
# © Crown-owned copyright 2023, Defence Science and Technology Laboratory UK
# © Crown-owned copyright 2024, Defence Science and Technology Laboratory UK
import platform
from typing import Dict