Merged PR 437: Automate Benchmarking Process with Scheduled and Release Triggers
## Summary
This pull request introduces an Azure DevOps pipeline that automates the benchmarking process for our project. The pipeline is designed to run under specific conditions and ensures that the benchmarking results are committed and published as artifacts. Additionally, the pipeline generates detailed benchmark reports in markdown format.
**Key Features:**
1. **Triggering Conditions:**
- Triggered on creation of branches matching `refs/heads/release/*`.
- Runs automatically at 2 AM every weekday for the `dev` branch.
2. **Version Handling:**
- Updates the version number with a `+dev.YYYYMMDD` suffix for the `dev` branch.
3. **Dependency Installation:**
- Sets up Python 3.11 and installs project dependencies, including development and reinforcement learning extras.
4. **Benchmark Execution:**
- Runs the benchmarking script and stores results in a version-based directory structure.
- Generates detailed markdown reports for benchmark results.
5. **Commit and Push Results:**
- Commits and pushes benchmark results for release branches.
6. **Artifact Preparation and Publishing:**
- Packages benchmark results into a tarball and publishes it as a pipeline artifact.
## Checklist
- [X] PR is linked to a **work item**
- [X] **acceptance criteria** of linked ticket are met
- [X] performed **self-review** of the code
- [ ] written **tests** for any new functionality added with this PR
- [ ] updated the **documentation** if this PR changes or adds functionality
- [ ] written/updated **design docs** if this PR implements new functionality
- [ ] updated the **change log**
- [X] ran **pre-commit** checks for code style
- [ ] attended to any **TO-DOs** left in the code
Related work items: #2648
This commit is contained in:
84
.azure/azure-benchmark-pipeline.yaml
Normal file
84
.azure/azure-benchmark-pipeline.yaml
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
# Benchmark pipeline: runs on creation of release/* branches and on a weekday
# schedule against dev. Results are committed back (release branches only) and
# always published as a pipeline artifact.
trigger:
  branches:
    exclude:
    - '*'
    include:
    - 'refs/heads/release/*'

schedules:
- cron: "0 2 * * 1-5" # Run at 2 AM every weekday
  displayName: "Weekday Schedule"
  branches:
    include:
    - 'refs/heads/dev'

pool:
  vmImage: ubuntu-latest

variables:
  # Populated at runtime by the 'Set Version Variables' step below.
  VERSION: ''
  MAJOR_VERSION: ''

steps:
- checkout: self
  # Keep the OAuth token in the local git config so the push step can authenticate.
  persistCredentials: true

# Dev builds get a date-stamped local version suffix so nightly benchmark
# results are distinguishable from release results.
- script: |
    VERSION=$(cat src/primaite/VERSION | tr -d '\n')
    if [[ "$(Build.SourceBranch)" == "refs/heads/dev" ]]; then
      DATE=$(date +%Y%m%d)
      echo "${VERSION}+dev.${DATE}" > src/primaite/VERSION
    fi
  displayName: 'Update VERSION file for Dev Benchmark'

# Export the (possibly suffixed) version as pipeline variables for later steps.
- script: |
    VERSION=$(cat src/primaite/VERSION | tr -d '\n')
    MAJOR_VERSION=$(echo $VERSION | cut -d. -f1)
    echo "##vso[task.setvariable variable=VERSION]$VERSION"
    echo "##vso[task.setvariable variable=MAJOR_VERSION]$MAJOR_VERSION"
  displayName: 'Set Version Variables'

- task: UsePythonVersion@0
  inputs:
    versionSpec: '3.11'
    addToPath: true

- script: |
    python -m pip install --upgrade pip
    pip install -e .[dev,rl]
    primaite setup
  displayName: 'Install Dependencies'

# Writes results under benchmark/results/v<MAJOR>/v<VERSION>/.
- script: |
    cd benchmark
    python3 primaite_benchmark.py
    cd ..
  displayName: 'Run Benchmarking Script'

# NOTE: this condition must match the 'Commit and Push Benchmark Results'
# condition below — previously it also required Build.Reason == 'Manual',
# which left branch-creation-triggered release builds pushing commits
# without a configured git identity.
- script: |
    git config --global user.email "oss@dstl.gov.uk"
    git config --global user.name "Defence Science and Technology Laboratory UK"
  workingDirectory: $(System.DefaultWorkingDirectory)
  displayName: 'Configure Git'
  condition: and(succeeded(), startsWith(variables['Build.SourceBranch'], 'refs/heads/release'))

# Only release-branch results are committed back to the repository; dev
# (scheduled) results are published solely as artifacts.
- script: |
    git add benchmark/results/v$(MAJOR_VERSION)/v$(VERSION)/*
    git commit -m "Automated benchmark output commit for version $(VERSION)"
    git push origin HEAD:refs/heads/$(Build.SourceBranchName)
  displayName: 'Commit and Push Benchmark Results'
  workingDirectory: $(System.DefaultWorkingDirectory)
  env:
    # Authentication is handled by persistCredentials above; token exposed
    # here for any tooling that reads it from the environment.
    GIT_CREDENTIALS: $(System.AccessToken)
  condition: and(succeeded(), startsWith(variables['Build.SourceBranch'], 'refs/heads/release'))

- script: |
    tar czf primaite_v$(VERSION)_benchmark.tar.gz benchmark/results/v$(MAJOR_VERSION)/v$(VERSION)
  displayName: 'Prepare Artifacts for Publishing'

- task: PublishPipelineArtifact@1
  inputs:
    targetPath: primaite_v$(VERSION)_benchmark.tar.gz
    artifactName: 'benchmark-output'
    publishLocation: 'pipeline'
  displayName: 'Publish Benchmark Output as Artifact'
|
||||||
@@ -117,14 +117,14 @@ class BenchmarkSession:
|
|||||||
def generate_learn_metadata_dict(self) -> Dict[str, Any]:
|
def generate_learn_metadata_dict(self) -> Dict[str, Any]:
|
||||||
"""Metadata specific to the learning session."""
|
"""Metadata specific to the learning session."""
|
||||||
total_s, s_per_step, s_per_100_steps_10_nodes = self._learn_benchmark_durations()
|
total_s, s_per_step, s_per_100_steps_10_nodes = self._learn_benchmark_durations()
|
||||||
self.gym_env.average_reward_per_episode.pop(0) # remove episode 0
|
self.gym_env.total_reward_per_episode.pop(0) # remove episode 0
|
||||||
return {
|
return {
|
||||||
"total_episodes": self.gym_env.episode_counter,
|
"total_episodes": self.gym_env.episode_counter,
|
||||||
"total_time_steps": self.gym_env.total_time_steps,
|
"total_time_steps": self.gym_env.total_time_steps,
|
||||||
"total_s": total_s,
|
"total_s": total_s,
|
||||||
"s_per_step": s_per_step,
|
"s_per_step": s_per_step,
|
||||||
"s_per_100_steps_10_nodes": s_per_100_steps_10_nodes,
|
"s_per_100_steps_10_nodes": s_per_100_steps_10_nodes,
|
||||||
"av_reward_per_episode": self.gym_env.average_reward_per_episode,
|
"total_reward_per_episode": self.gym_env.total_reward_per_episode,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -9,10 +9,6 @@ import plotly.graph_objects as go
|
|||||||
import polars as pl
|
import polars as pl
|
||||||
import yaml
|
import yaml
|
||||||
from plotly.graph_objs import Figure
|
from plotly.graph_objs import Figure
|
||||||
from pylatex import Command, Document
|
|
||||||
from pylatex import Figure as LatexFigure
|
|
||||||
from pylatex import Section, Subsection, Tabular
|
|
||||||
from pylatex.utils import bold
|
|
||||||
from utils import _get_system_info
|
from utils import _get_system_info
|
||||||
|
|
||||||
import primaite
|
import primaite
|
||||||
@@ -39,19 +35,19 @@ def _build_benchmark_results_dict(start_datetime: datetime, metadata_dict: Dict,
|
|||||||
"av_s_per_step": sum(d["s_per_step"] for d in metadata_dict.values()) / num_sessions,
|
"av_s_per_step": sum(d["s_per_step"] for d in metadata_dict.values()) / num_sessions,
|
||||||
"av_s_per_100_steps_10_nodes": sum(d["s_per_100_steps_10_nodes"] for d in metadata_dict.values())
|
"av_s_per_100_steps_10_nodes": sum(d["s_per_100_steps_10_nodes"] for d in metadata_dict.values())
|
||||||
/ num_sessions,
|
/ num_sessions,
|
||||||
"combined_av_reward_per_episode": {},
|
"combined_total_reward_per_episode": {},
|
||||||
"session_av_reward_per_episode": {k: v["av_reward_per_episode"] for k, v in metadata_dict.items()},
|
"session_total_reward_per_episode": {k: v["total_reward_per_episode"] for k, v in metadata_dict.items()},
|
||||||
"config": config,
|
"config": config,
|
||||||
}
|
}
|
||||||
|
|
||||||
# find the average of each episode across all sessions
|
# find the average of each episode across all sessions
|
||||||
episodes = metadata_dict[1]["av_reward_per_episode"].keys()
|
episodes = metadata_dict[1]["total_reward_per_episode"].keys()
|
||||||
|
|
||||||
for episode in episodes:
|
for episode in episodes:
|
||||||
combined_av_reward = (
|
combined_av_reward = (
|
||||||
sum(metadata_dict[k]["av_reward_per_episode"][episode] for k in metadata_dict.keys()) / num_sessions
|
sum(metadata_dict[k]["total_reward_per_episode"][episode] for k in metadata_dict.keys()) / num_sessions
|
||||||
)
|
)
|
||||||
averaged_data["combined_av_reward_per_episode"][episode] = combined_av_reward
|
averaged_data["combined_total_reward_per_episode"][episode] = combined_av_reward
|
||||||
|
|
||||||
return averaged_data
|
return averaged_data
|
||||||
|
|
||||||
@@ -87,7 +83,7 @@ def _plot_benchmark_metadata(
|
|||||||
fig = go.Figure(layout=layout)
|
fig = go.Figure(layout=layout)
|
||||||
fig.update_layout(template=PLOT_CONFIG["template"])
|
fig.update_layout(template=PLOT_CONFIG["template"])
|
||||||
|
|
||||||
for session, av_reward_dict in benchmark_metadata_dict["session_av_reward_per_episode"].items():
|
for session, av_reward_dict in benchmark_metadata_dict["session_total_reward_per_episode"].items():
|
||||||
df = _get_df_from_episode_av_reward_dict(av_reward_dict)
|
df = _get_df_from_episode_av_reward_dict(av_reward_dict)
|
||||||
fig.add_trace(
|
fig.add_trace(
|
||||||
go.Scatter(
|
go.Scatter(
|
||||||
@@ -100,7 +96,7 @@ def _plot_benchmark_metadata(
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
df = _get_df_from_episode_av_reward_dict(benchmark_metadata_dict["combined_av_reward_per_episode"])
|
df = _get_df_from_episode_av_reward_dict(benchmark_metadata_dict["combined_total_reward_per_episode"])
|
||||||
fig.add_trace(
|
fig.add_trace(
|
||||||
go.Scatter(
|
go.Scatter(
|
||||||
x=df["episode"], y=df["av_reward"], mode="lines", name="Combined Session Av", line={"color": "#FF0000"}
|
x=df["episode"], y=df["av_reward"], mode="lines", name="Combined Session Av", line={"color": "#FF0000"}
|
||||||
@@ -136,11 +132,11 @@ def _plot_all_benchmarks_combined_session_av(results_directory: Path) -> Figure:
|
|||||||
|
|
||||||
Does this by iterating over the ``benchmark/results`` directory and
|
Does this by iterating over the ``benchmark/results`` directory and
|
||||||
extracting the benchmark metadata json for each version that has been
|
extracting the benchmark metadata json for each version that has been
|
||||||
benchmarked. The combined_av_reward_per_episode is extracted from each,
|
benchmarked. The combined_total_reward_per_episode is extracted from each,
|
||||||
converted into a polars dataframe, and plotted as a scatter line in plotly.
|
converted into a polars dataframe, and plotted as a scatter line in plotly.
|
||||||
"""
|
"""
|
||||||
major_v = primaite.__version__.split(".")[0]
|
major_v = primaite.__version__.split(".")[0]
|
||||||
title = f"Learning Benchmarking of All Released Versions under Major v{major_v}.*.*"
|
title = f"Learning Benchmarking of All Released Versions under Major v{major_v}.#.#"
|
||||||
subtitle = "Rolling Av (Combined Session Av)"
|
subtitle = "Rolling Av (Combined Session Av)"
|
||||||
if title:
|
if title:
|
||||||
if subtitle:
|
if subtitle:
|
||||||
@@ -162,7 +158,7 @@ def _plot_all_benchmarks_combined_session_av(results_directory: Path) -> Figure:
|
|||||||
metadata_file = dir / f"{dir.name}_benchmark_metadata.json"
|
metadata_file = dir / f"{dir.name}_benchmark_metadata.json"
|
||||||
with open(metadata_file, "r") as file:
|
with open(metadata_file, "r") as file:
|
||||||
metadata_dict = json.load(file)
|
metadata_dict = json.load(file)
|
||||||
df = _get_df_from_episode_av_reward_dict(metadata_dict["combined_av_reward_per_episode"])
|
df = _get_df_from_episode_av_reward_dict(metadata_dict["combined_total_reward_per_episode"])
|
||||||
|
|
||||||
fig.add_trace(go.Scatter(x=df["episode"], y=df["rolling_av_reward"], mode="lines", name=dir.name))
|
fig.add_trace(go.Scatter(x=df["episode"], y=df["rolling_av_reward"], mode="lines", name=dir.name))
|
||||||
|
|
||||||
@@ -208,98 +204,77 @@ def build_benchmark_latex_report(
|
|||||||
|
|
||||||
fig = _plot_all_benchmarks_combined_session_av(results_directory=results_root_path)
|
fig = _plot_all_benchmarks_combined_session_av(results_directory=results_root_path)
|
||||||
|
|
||||||
all_version_plot_path = results_root_path / "PrimAITE Versions Learning Benchmark.png"
|
all_version_plot_path = version_result_dir / "PrimAITE Versions Learning Benchmark.png"
|
||||||
fig.write_image(all_version_plot_path)
|
fig.write_image(all_version_plot_path)
|
||||||
|
|
||||||
geometry_options = {"tmargin": "2.5cm", "rmargin": "2.5cm", "bmargin": "2.5cm", "lmargin": "2.5cm"}
|
|
||||||
data = benchmark_metadata_dict
|
data = benchmark_metadata_dict
|
||||||
primaite_version = data["primaite_version"]
|
primaite_version = data["primaite_version"]
|
||||||
|
|
||||||
# Create a new document
|
with open(version_result_dir / f"PrimAITE v{primaite_version} Learning Benchmark.md", "w") as file:
|
||||||
doc = Document("report", geometry_options=geometry_options)
|
# Title
|
||||||
# Title
|
file.write(f"# PrimAITE v{primaite_version} Learning Benchmark\n")
|
||||||
doc.preamble.append(Command("title", f"PrimAITE {primaite_version} Learning Benchmark"))
|
file.write("## PrimAITE Dev Team\n")
|
||||||
doc.preamble.append(Command("author", "PrimAITE Dev Team"))
|
file.write(f"### {datetime.now().date()}\n")
|
||||||
doc.preamble.append(Command("date", datetime.now().date()))
|
file.write("\n---\n")
|
||||||
doc.append(Command("maketitle"))
|
|
||||||
|
|
||||||
sessions = data["total_sessions"]
|
sessions = data["total_sessions"]
|
||||||
episodes = session_metadata[1]["total_episodes"] - 1
|
episodes = session_metadata[1]["total_episodes"] - 1
|
||||||
steps = data["config"]["game"]["max_episode_length"]
|
steps = data["config"]["game"]["max_episode_length"]
|
||||||
|
|
||||||
# Body
|
# Body
|
||||||
with doc.create(Section("Introduction")):
|
file.write("## 1 Introduction\n")
|
||||||
doc.append(
|
file.write(
|
||||||
f"PrimAITE v{primaite_version} was benchmarked automatically upon release. Learning rate metrics "
|
f"PrimAITE v{primaite_version} was benchmarked automatically upon release. Learning rate metrics "
|
||||||
f"were captured to be referenced during system-level testing and user acceptance testing (UAT)."
|
f"were captured to be referenced during system-level testing and user acceptance testing (UAT).\n"
|
||||||
)
|
)
|
||||||
doc.append(
|
file.write(
|
||||||
f"\nThe benchmarking process consists of running {sessions} training session using the same "
|
f"The benchmarking process consists of running {sessions} training sessions using the same "
|
||||||
f"config file. Each session trains an agent for {episodes} episodes, "
|
f"config file. Each session trains an agent for {episodes} episodes, "
|
||||||
f"with each episode consisting of {steps} steps."
|
f"with each episode consisting of {steps} steps.\n"
|
||||||
)
|
)
|
||||||
doc.append(
|
file.write(
|
||||||
f"\nThe total reward per episode from each session is captured. This is then used to calculate an "
|
f"The total reward per episode from each session is captured. This is then used to calculate an "
|
||||||
f"caverage total reward per episode from the {sessions} individual sessions for smoothing. "
|
f"average total reward per episode from the {sessions} individual sessions for smoothing. "
|
||||||
f"Finally, a 25-widow rolling average of the average total reward per session is calculated for "
|
f"Finally, a 25-window rolling average of the average total reward per session is calculated for "
|
||||||
f"further smoothing."
|
f"further smoothing.\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
with doc.create(Section("System Information")):
|
file.write("## 2 System Information\n")
|
||||||
with doc.create(Subsection("Python")):
|
i = 1
|
||||||
with doc.create(Tabular("|l|l|")) as table:
|
file.write(f"### 2.{i} Python\n")
|
||||||
table.add_hline()
|
file.write(f"**Version:** {sys.version}\n")
|
||||||
table.add_row((bold("Version"), sys.version))
|
|
||||||
table.add_hline()
|
|
||||||
for section, section_data in data["system_info"].items():
|
for section, section_data in data["system_info"].items():
|
||||||
|
i += 1
|
||||||
if section_data:
|
if section_data:
|
||||||
with doc.create(Subsection(section)):
|
file.write(f"### 2.{i} {section}\n")
|
||||||
if isinstance(section_data, dict):
|
if isinstance(section_data, dict):
|
||||||
with doc.create(Tabular("|l|l|")) as table:
|
for key, value in section_data.items():
|
||||||
table.add_hline()
|
file.write(f"- **{key}:** {value}\n")
|
||||||
for key, value in section_data.items():
|
|
||||||
table.add_row((bold(key), value))
|
|
||||||
table.add_hline()
|
|
||||||
elif isinstance(section_data, list):
|
|
||||||
headers = section_data[0].keys()
|
|
||||||
tabs_str = "|".join(["l" for _ in range(len(headers))])
|
|
||||||
tabs_str = f"|{tabs_str}|"
|
|
||||||
with doc.create(Tabular(tabs_str)) as table:
|
|
||||||
table.add_hline()
|
|
||||||
table.add_row([bold(h) for h in headers])
|
|
||||||
table.add_hline()
|
|
||||||
for item in section_data:
|
|
||||||
table.add_row(item.values())
|
|
||||||
table.add_hline()
|
|
||||||
|
|
||||||
headers_map = {
|
headers_map = {
|
||||||
"total_sessions": "Total Sessions",
|
"total_sessions": "Total Sessions",
|
||||||
"total_episodes": "Total Episodes",
|
"total_episodes": "Total Episodes",
|
||||||
"total_time_steps": "Total Steps",
|
"total_time_steps": "Total Steps",
|
||||||
"av_s_per_session": "Av Session Duration (s)",
|
"av_s_per_session": "Av Session Duration (s)",
|
||||||
"av_s_per_step": "Av Step Duration (s)",
|
"av_s_per_step": "Av Step Duration (s)",
|
||||||
"av_s_per_100_steps_10_nodes": "Av Duration per 100 Steps per 10 Nodes (s)",
|
"av_s_per_100_steps_10_nodes": "Av Duration per 100 Steps per 10 Nodes (s)",
|
||||||
}
|
}
|
||||||
with doc.create(Section("Stats")):
|
|
||||||
with doc.create(Subsection("Benchmark Results")):
|
|
||||||
with doc.create(Tabular("|l|l|")) as table:
|
|
||||||
table.add_hline()
|
|
||||||
for section, header in headers_map.items():
|
|
||||||
if section.startswith("av_"):
|
|
||||||
table.add_row((bold(header), f"{data[section]:.4f}"))
|
|
||||||
else:
|
|
||||||
table.add_row((bold(header), data[section]))
|
|
||||||
table.add_hline()
|
|
||||||
|
|
||||||
with doc.create(Section("Graphs")):
|
file.write("## 3 Stats\n")
|
||||||
with doc.create(Subsection(f"v{primaite_version} Learning Benchmark Plot")):
|
for section, header in headers_map.items():
|
||||||
with doc.create(LatexFigure(position="h!")) as pic:
|
if section.startswith("av_"):
|
||||||
pic.add_image(str(this_version_plot_path))
|
file.write(f"- **{header}:** {data[section]:.4f}\n")
|
||||||
pic.add_caption(f"PrimAITE {primaite_version} Learning Benchmark Plot")
|
else:
|
||||||
|
file.write(f"- **{header}:** {data[section]}\n")
|
||||||
|
|
||||||
with doc.create(Subsection(f"Learning Benchmarking of All Released Versions under Major v{major_v}.*.*")):
|
file.write("## 4 Graphs\n")
|
||||||
with doc.create(LatexFigure(position="h!")) as pic:
|
|
||||||
pic.add_image(str(all_version_plot_path))
|
|
||||||
pic.add_caption(f"Learning Benchmarking of All Released Versions under Major v{major_v}.*.*")
|
|
||||||
|
|
||||||
doc.generate_pdf(str(this_version_plot_path).replace(".png", ""), clean_tex=True)
|
file.write(f"### 4.1 v{primaite_version} Learning Benchmark Plot\n")
|
||||||
|
file.write(f"\n")
|
||||||
|
|
||||||
|
file.write(f"### 4.2 Learning Benchmarking of All Released Versions under Major v{major_v}.#.#\n")
|
||||||
|
file.write(
|
||||||
|
f"\n"
|
||||||
|
)
|
||||||
|
|||||||
@@ -64,7 +64,6 @@ dev = [
|
|||||||
"gputil==1.4.0",
|
"gputil==1.4.0",
|
||||||
"pip-licenses==4.3.0",
|
"pip-licenses==4.3.0",
|
||||||
"pre-commit==2.20.0",
|
"pre-commit==2.20.0",
|
||||||
"pylatex==1.4.1",
|
|
||||||
"pytest==7.2.0",
|
"pytest==7.2.0",
|
||||||
"pytest-xdist==3.3.1",
|
"pytest-xdist==3.3.1",
|
||||||
"pytest-cov==4.0.0",
|
"pytest-cov==4.0.0",
|
||||||
|
|||||||
Reference in New Issue
Block a user