diff --git a/.gitignore b/.gitignore index c3d54ada..4bb125da 100644 --- a/.gitignore +++ b/.gitignore @@ -164,3 +164,6 @@ src/primaite/notebooks/scratch.py sandbox.py sandbox/ sandbox.ipynb + +# benchmarking +**/benchmark_session/ diff --git a/benchmark/primaite_benchmark.py b/benchmark/primaite_benchmark.py index 226bb71e..c68d5a68 100644 --- a/benchmark/primaite_benchmark.py +++ b/benchmark/primaite_benchmark.py @@ -1,37 +1,19 @@ -# flake8: noqa -raise DeprecationWarning( - "Benchmarking depends on deprecated functionality and it has not been updated to primaite v3 yet." -) # © Crown-owned copyright 2023, Defence Science and Technology Laboratory UK -import json -import platform import shutil -import sys from datetime import datetime from pathlib import Path -from typing import Any, Dict, Final, Optional, Tuple, Union -from unittest.mock import patch +from typing import Any, Dict, Final, Tuple -import GPUtil -import plotly.graph_objects as go -import polars as pl -import psutil -import yaml -from plotly.graph_objs import Figure -from pylatex import Command, Document -from pylatex import Figure as LatexFigure -from pylatex import Section, Subsection, Tabular -from pylatex.utils import bold +from stable_baselines3 import PPO import primaite -from primaite.config.lay_down_config import data_manipulation_config_path -from primaite.data_viz.session_plots import get_plotly_config -from primaite.environment.primaite_env import Primaite -from primaite.primaite_session import PrimaiteSession +from benchmark.utils.benchmark import BenchmarkPrimaiteGymEnv +from benchmark.utils.report import build_benchmark_latex_report +from primaite.config.load import data_manipulation_config_path _LOGGER = primaite.getLogger(__name__) -_BENCHMARK_ROOT = Path(__file__).parent +_BENCHMARK_ROOT = Path(__file__).parent / "benchmark_session" _RESULTS_ROOT: Final[Path] = _BENCHMARK_ROOT / "results" _RESULTS_ROOT.mkdir(exist_ok=True, parents=True) @@ -41,171 +23,51 @@ if _OUTPUT_ROOT.exists(): 
shutil.rmtree(_OUTPUT_ROOT) _OUTPUT_ROOT.mkdir() -_TRAINING_CONFIG_PATH = _BENCHMARK_ROOT / "config" / "benchmark_training_config.yaml" -_LAY_DOWN_CONFIG_PATH = data_manipulation_config_path() +class BenchmarkSession: + """Benchmark Session class.""" -def get_size(size_bytes: int) -> str: - """ - Scale bytes to its proper format. + gym_env: BenchmarkPrimaiteGymEnv + """Gym environment used by the session to train.""" - e.g: - 1253656 => '1.20MB' - 1253656678 => '1.17GB' + num_episodes: int + """Number of episodes to run the training session.""" - : - """ - factor = 1024 - for unit in ["", "K", "M", "G", "T", "P"]: - if size_bytes < factor: - return f"{size_bytes:.2f}{unit}B" - size_bytes /= factor + batch_size: int + """Number of steps for each episode.""" + start_time: datetime + """Start time for the session.""" -def _get_system_info() -> Dict: - """Builds and returns a dict containing system info.""" - uname = platform.uname() - cpu_freq = psutil.cpu_freq() - virtual_mem = psutil.virtual_memory() - swap_mem = psutil.swap_memory() - gpus = GPUtil.getGPUs() - return { - "System": { - "OS": uname.system, - "OS Version": uname.version, - "Machine": uname.machine, - "Processor": uname.processor, - }, - "CPU": { - "Physical Cores": psutil.cpu_count(logical=False), - "Total Cores": psutil.cpu_count(logical=True), - "Max Frequency": f"{cpu_freq.max:.2f}Mhz", - }, - "Memory": {"Total": get_size(virtual_mem.total), "Swap Total": get_size(swap_mem.total)}, - "GPU": [{"Name": gpu.name, "Total Memory": f"{gpu.memoryTotal}MB"} for gpu in gpus], - } + end_time: datetime + """End time for the session.""" + session_metadata: Dict + """Dict containing the metadata for the session - used to generate benchmark report.""" -def _build_benchmark_latex_report( - benchmark_metadata_dict: Dict, this_version_plot_path: Path, all_version_plot_path: Path -) -> None: - geometry_options = {"tmargin": "2.5cm", "rmargin": "2.5cm", "bmargin": "2.5cm", "lmargin": "2.5cm"} - data = 
benchmark_metadata_dict - primaite_version = data["primaite_version"] + def __init__(self, gym_env: BenchmarkPrimaiteGymEnv, num_episodes: int, batch_size: int): + """Initialise the BenchmarkSession.""" + self.gym_env = gym_env + self.num_episodes = num_episodes + self.batch_size = batch_size - # Create a new document - doc = Document("report", geometry_options=geometry_options) - # Title - doc.preamble.append(Command("title", f"PrimAITE {primaite_version} Learning Benchmark")) - doc.preamble.append(Command("author", "PrimAITE Dev Team")) - doc.preamble.append(Command("date", datetime.now().date())) - doc.append(Command("maketitle")) + def train(self): + """Run the training session.""" + # start timer for session + self.start_time = datetime.now() - sessions = data["total_sessions"] - episodes = data["training_config"]["num_train_episodes"] - steps = data["training_config"]["num_train_steps"] - - # Body - with doc.create(Section("Introduction")): - doc.append( - f"PrimAITE v{primaite_version} was benchmarked automatically upon release. Learning rate metrics " - f"were captured to be referenced during system-level testing and user acceptance testing (UAT)." - ) - doc.append( - f"\nThe benchmarking process consists of running {sessions} training session using the same " - f"training and lay down config files. Each session trains an agent for {episodes} episodes, " - f"with each episode consisting of {steps} steps." - ) - doc.append( - f"\nThe mean reward per episode from each session is captured. This is then used to calculate a " - f"combined average reward per episode from the {sessions} individual sessions for smoothing. " - f"Finally, a 25-widow rolling average of the combined average reward per session is calculated for " - f"further smoothing." 
+ model = PPO( + policy="MlpPolicy", + env=self.gym_env, + batch_size=self.batch_size, + n_steps=self.batch_size * self.num_episodes, ) + model.learn(total_timesteps=self.batch_size * self.num_episodes) - with doc.create(Section("System Information")): - with doc.create(Subsection("Python")): - with doc.create(Tabular("|l|l|")) as table: - table.add_hline() - table.add_row((bold("Version"), sys.version)) - table.add_hline() - for section, section_data in data["system_info"].items(): - if section_data: - with doc.create(Subsection(section)): - if isinstance(section_data, dict): - with doc.create(Tabular("|l|l|")) as table: - table.add_hline() - for key, value in section_data.items(): - table.add_row((bold(key), value)) - table.add_hline() - elif isinstance(section_data, list): - headers = section_data[0].keys() - tabs_str = "|".join(["l" for _ in range(len(headers))]) - tabs_str = f"|{tabs_str}|" - with doc.create(Tabular(tabs_str)) as table: - table.add_hline() - table.add_row([bold(h) for h in headers]) - table.add_hline() - for item in section_data: - table.add_row(item.values()) - table.add_hline() + # end timer for session + self.end_time = datetime.now() - headers_map = { - "total_sessions": "Total Sessions", - "total_episodes": "Total Episodes", - "total_time_steps": "Total Steps", - "av_s_per_session": "Av Session Duration (s)", - "av_s_per_step": "Av Step Duration (s)", - "av_s_per_100_steps_10_nodes": "Av Duration per 100 Steps per 10 Nodes (s)", - } - with doc.create(Section("Stats")): - with doc.create(Subsection("Benchmark Results")): - with doc.create(Tabular("|l|l|")) as table: - table.add_hline() - for section, header in headers_map.items(): - if section.startswith("av_"): - table.add_row((bold(header), f"{data[section]:.4f}")) - else: - table.add_row((bold(header), data[section])) - table.add_hline() - - with doc.create(Section("Graphs")): - with doc.create(Subsection(f"PrimAITE {primaite_version} Learning Benchmark Plot")): - with 
doc.create(LatexFigure(position="h!")) as pic: - pic.add_image(str(this_version_plot_path)) - pic.add_caption(f"PrimAITE {primaite_version} Learning Benchmark Plot") - - with doc.create(Subsection("PrimAITE All Versions Learning Benchmark Plot")): - with doc.create(LatexFigure(position="h!")) as pic: - pic.add_image(str(all_version_plot_path)) - pic.add_caption("PrimAITE All Versions Learning Benchmark Plot") - - doc.generate_pdf(str(this_version_plot_path).replace(".png", ""), clean_tex=True) - - -class BenchmarkPrimaiteSession(PrimaiteSession): - """A benchmarking primaite session.""" - - def __init__( - self, - training_config_path: Union[str, Path], - lay_down_config_path: Union[str, Path], - ) -> None: - super().__init__(training_config_path, lay_down_config_path) - self.setup() - - @property - def env(self) -> Primaite: - """Direct access to the env for ease of testing.""" - return self._agent_session._env # noqa - - def __enter__(self) -> "BenchmarkPrimaiteSession": - return self - - # TODO: typehints uncertain - def __exit__(self, type: Any, value: Any, tb: Any) -> None: - shutil.rmtree(self.session_path) - _LOGGER.debug(f"Deleted benchmark session directory: {self.session_path}") + self.session_metadata = self.generate_learn_metadata_dict() def _learn_benchmark_durations(self) -> Tuple[float, float, float]: """ @@ -219,235 +81,78 @@ class BenchmarkPrimaiteSession(PrimaiteSession): :return: The learning benchmark durations as a Tuple of three floats: Tuple[total_s, s_per_step, s_per_100_steps_10_nodes]. 
""" - data = self.metadata_file_as_dict() - start_dt = datetime.fromisoformat(data["start_datetime"]) - end_dt = datetime.fromisoformat(data["end_datetime"]) - delta = end_dt - start_dt + delta = self.end_time - self.start_time total_s = delta.total_seconds() - total_steps = data["learning"]["total_time_steps"] + total_steps = self.batch_size * self.num_episodes s_per_step = total_s / total_steps - num_nodes = self.env.num_nodes + num_nodes = len(self.gym_env.game.simulation.network.nodes) num_intervals = total_steps / 100 av_interval_time = total_s / num_intervals s_per_100_steps_10_nodes = av_interval_time / (num_nodes / 10) return total_s, s_per_step, s_per_100_steps_10_nodes - def learn_metadata_dict(self) -> Dict[str, Any]: + def generate_learn_metadata_dict(self) -> Dict[str, Any]: """Metadata specific to the learning session.""" total_s, s_per_step, s_per_100_steps_10_nodes = self._learn_benchmark_durations() + self.gym_env.average_reward_per_episode.pop(0) # remove episode 0 return { - "total_episodes": self.env.actual_episode_count, - "total_time_steps": self.env.total_step_count, + "total_episodes": self.gym_env.episode_counter, + "total_time_steps": self.gym_env.total_time_steps, "total_s": total_s, "s_per_step": s_per_step, "s_per_100_steps_10_nodes": s_per_100_steps_10_nodes, - "av_reward_per_episode": self.learn_av_reward_per_episode_dict(), + "av_reward_per_episode": self.gym_env.average_reward_per_episode, } -def _get_benchmark_session_path(session_timestamp: datetime) -> Path: - return _OUTPUT_ROOT / session_timestamp.strftime("%Y-%m-%d_%H-%M-%S") - - -def _get_benchmark_primaite_session() -> BenchmarkPrimaiteSession: - with patch("primaite.agents.agent_abc.get_session_path", _get_benchmark_session_path) as mck: - mck.session_timestamp = datetime.now() - return BenchmarkPrimaiteSession(_TRAINING_CONFIG_PATH, _LAY_DOWN_CONFIG_PATH) - - -def _build_benchmark_results_dict(start_datetime: datetime, metadata_dict: Dict) -> dict: - n = len(metadata_dict) 
- with open(_TRAINING_CONFIG_PATH, "r") as file: - training_config_dict = yaml.safe_load(file) - with open(_LAY_DOWN_CONFIG_PATH, "r") as file: - lay_down_config_dict = yaml.safe_load(file) - averaged_data = { - "start_timestamp": start_datetime.isoformat(), - "end_datetime": datetime.now().isoformat(), - "primaite_version": primaite.__version__, - "system_info": _get_system_info(), - "total_sessions": n, - "total_episodes": sum(d["total_episodes"] for d in metadata_dict.values()), - "total_time_steps": sum(d["total_time_steps"] for d in metadata_dict.values()), - "av_s_per_session": sum(d["total_s"] for d in metadata_dict.values()) / n, - "av_s_per_step": sum(d["s_per_step"] for d in metadata_dict.values()) / n, - "av_s_per_100_steps_10_nodes": sum(d["s_per_100_steps_10_nodes"] for d in metadata_dict.values()) / n, - "combined_av_reward_per_episode": {}, - "session_av_reward_per_episode": {k: v["av_reward_per_episode"] for k, v in metadata_dict.items()}, - "training_config": training_config_dict, - "lay_down_config": lay_down_config_dict, - } - - episodes = metadata_dict[1]["av_reward_per_episode"].keys() - - for episode in episodes: - combined_av_reward = sum(metadata_dict[k]["av_reward_per_episode"][episode] for k in metadata_dict.keys()) / n - averaged_data["combined_av_reward_per_episode"][episode] = combined_av_reward - - return averaged_data - - -def _get_df_from_episode_av_reward_dict(data: Dict) -> pl.DataFrame: - data: Dict = {"episode": data.keys(), "av_reward": data.values()} - - return ( - pl.from_dict(data) - .with_columns(rolling_mean=pl.col("av_reward").rolling_mean(window_size=25)) - .rename({"rolling_mean": "rolling_av_reward"}) - ) - - -def _plot_benchmark_metadata( - benchmark_metadata_dict: Dict, - title: Optional[str] = None, - subtitle: Optional[str] = None, -) -> Figure: - if title: - if subtitle: - title = f"{title}
{subtitle}" - else: - if subtitle: - title = subtitle - - config = get_plotly_config() - layout = go.Layout( - autosize=config["size"]["auto_size"], - width=config["size"]["width"], - height=config["size"]["height"], - ) - # Create the line graph with a colored line - fig = go.Figure(layout=layout) - fig.update_layout(template=config["template"]) - - for session, av_reward_dict in benchmark_metadata_dict["session_av_reward_per_episode"].items(): - df = _get_df_from_episode_av_reward_dict(av_reward_dict) - fig.add_trace( - go.Scatter( - x=df["episode"], - y=df["av_reward"], - mode="lines", - name=f"Session {session}", - opacity=0.25, - line={"color": "#a6a6a6"}, - ) - ) - - df = _get_df_from_episode_av_reward_dict(benchmark_metadata_dict["combined_av_reward_per_episode"]) - fig.add_trace( - go.Scatter( - x=df["episode"], y=df["av_reward"], mode="lines", name="Combined Session Av", line={"color": "#FF0000"} - ) - ) - - fig.add_trace( - go.Scatter( - x=df["episode"], - y=df["rolling_av_reward"], - mode="lines", - name="Rolling Av (Combined Session Av)", - line={"color": "#4CBB17"}, - ) - ) - - # Set the layout of the graph - fig.update_layout( - xaxis={ - "title": "Episode", - "type": "linear", - }, - yaxis={"title": "Average Reward"}, - title=title, - ) - - return fig - - -def _plot_all_benchmarks_combined_session_av() -> Figure: +def _get_benchmark_primaite_environment() -> BenchmarkPrimaiteGymEnv: """ - Plot the Benchmark results for each released version of PrimAITE. + Create an instance of the BenchmarkPrimaiteGymEnv. - Does this by iterating over the ``benchmark/results`` directory and - extracting the benchmark metadata json for each version that has been - benchmarked. The combined_av_reward_per_episode is extracted from each, - converted into a polars dataframe, and plotted as a scatter line in plotly. + This environment will be used to train the agents on. 
""" - title = "PrimAITE Versions Learning Benchmark" - subtitle = "Rolling Av (Combined Session Av)" - if title: - if subtitle: - title = f"{title}
{subtitle}" - else: - if subtitle: - title = subtitle - config = get_plotly_config() - layout = go.Layout( - autosize=config["size"]["auto_size"], - width=config["size"]["width"], - height=config["size"]["height"], - ) - # Create the line graph with a colored line - fig = go.Figure(layout=layout) - fig.update_layout(template=config["template"]) - - for dir in _RESULTS_ROOT.iterdir(): - if dir.is_dir(): - metadata_file = dir / f"{dir.name}_benchmark_metadata.json" - with open(metadata_file, "r") as file: - metadata_dict = json.load(file) - df = _get_df_from_episode_av_reward_dict(metadata_dict["combined_av_reward_per_episode"]) - - fig.add_trace(go.Scatter(x=df["episode"], y=df["rolling_av_reward"], mode="lines", name=dir.name)) - - # Set the layout of the graph - fig.update_layout( - xaxis={ - "title": "Episode", - "type": "linear", - }, - yaxis={"title": "Average Reward"}, - title=title, - ) - fig["data"][0]["showlegend"] = True - - return fig + return BenchmarkPrimaiteGymEnv(env_config=data_manipulation_config_path()) -def run() -> None: +def _prepare_session_directory(): + """Prepare the session directory so that it is easier to clean up after the benchmarking is done.""" + # override session path + session_path = _BENCHMARK_ROOT / "sessions" + + if session_path.is_dir(): + shutil.rmtree(session_path) + + primaite.PRIMAITE_PATHS.user_sessions_path = session_path + primaite.PRIMAITE_PATHS.user_sessions_path.mkdir(exist_ok=True, parents=True) + + +def run(number_of_sessions: int = 1, num_episodes: int = 3, batch_size: int = 128) -> None: # 10 # 1000 # 256 """Run the PrimAITE benchmark.""" - start_datetime = datetime.now() - av_reward_per_episode_dicts = {} - for i in range(1, 11): + benchmark_start_time = datetime.now() + + session_metadata_dict = {} + + _prepare_session_directory() + + # run training + for i in range(1, number_of_sessions + 1): print(f"Starting Benchmark Session: {i}") - with _get_benchmark_primaite_session() as session: - session.learn() - 
av_reward_per_episode_dicts[i] = session.learn_metadata_dict() - benchmark_metadata = _build_benchmark_results_dict( - start_datetime=start_datetime, metadata_dict=av_reward_per_episode_dicts + with _get_benchmark_primaite_environment() as gym_env: + session = BenchmarkSession(gym_env=gym_env, num_episodes=num_episodes, batch_size=batch_size) + session.train() + session_metadata_dict[i] = session.session_metadata + + # generate report + build_benchmark_latex_report( + benchmark_start_time=benchmark_start_time, + session_metadata=session_metadata_dict, + config_path=data_manipulation_config_path(), + results_root_path=_RESULTS_ROOT, ) - v_str = f"v{primaite.__version__}" - - version_result_dir = _RESULTS_ROOT / v_str - if version_result_dir.exists(): - shutil.rmtree(version_result_dir) - version_result_dir.mkdir(exist_ok=True, parents=True) - - with open(version_result_dir / f"{v_str}_benchmark_metadata.json", "w") as file: - json.dump(benchmark_metadata, file, indent=4) - title = f"PrimAITE v{primaite.__version__.strip()} Learning Benchmark" - fig = _plot_benchmark_metadata(benchmark_metadata, title=title) - this_version_plot_path = version_result_dir / f"{title}.png" - fig.write_image(this_version_plot_path) - - fig = _plot_all_benchmarks_combined_session_av() - - all_version_plot_path = _RESULTS_ROOT / "PrimAITE Versions Learning Benchmark.png" - fig.write_image(all_version_plot_path) - - _build_benchmark_latex_report(benchmark_metadata, this_version_plot_path, all_version_plot_path) if __name__ == "__main__": diff --git a/benchmark/utils/benchmark.py b/benchmark/utils/benchmark.py new file mode 100644 index 00000000..fc457a03 --- /dev/null +++ b/benchmark/utils/benchmark.py @@ -0,0 +1,122 @@ +from datetime import datetime +from typing import Any, Dict, List, Optional, Tuple + +from gymnasium.core import ObsType + +from primaite.session.environment import PrimaiteGymEnv + + +class BenchmarkPrimaiteGymEnv(PrimaiteGymEnv): + """ + Class that extends the 
PrimaiteGymEnv. + + The reset method is extended so that the average rewards per episode are recorded. + """ + + total_time_steps: int = 0 + + def reset(self, seed: Optional[int] = None) -> Tuple[ObsType, Dict[str, Any]]: + """Overrides the PrimAITEGymEnv reset so that the total timesteps is saved.""" + self.total_time_steps += self.game.step_counter + + return super().reset(seed=seed) + + +##################################### +# IGNORE BELOW FOR NOW +##################################### + + +class BenchMarkOSInfo: + """Operating System Information about the machine that run the benchmark.""" + + operating_system: str + """The operating system the benchmark was run on.""" + + operating_system_version: str + """The operating system version the benchmark was run on.""" + + machine: str + """The type of machine running the benchmark.""" + + processor: str + """The processor used to run the benchmark.""" + + +class BenchMarkCPUInfo: + """CPU Information of the machine that ran the benchmark.""" + + physical_cores: int + """The number of CPU cores the machine that ran the benchmark had.""" + + total_cores: int + """The number of total cores the machine that run the benchmark had.""" + + max_frequency: int + """The CPU's maximum clock speed.""" + + +class BenchMarkMemoryInfo: + """The Memory Information of the machine that ran the benchmark.""" + + total: str + """The total amount of memory.""" + + swap_total: str + """Virtual memory.""" + + +class BenchMarkGPUInfo: + """The GPU Information of the machine that ran the benchmark.""" + + name: str + """GPU name.""" + + total_memory: str + """GPU memory.""" + + +class BenchMarkSystemInfo: + """Overall system information of the machine that ran the benchmark.""" + + system: BenchMarkOSInfo + cpu: BenchMarkCPUInfo + memory: BenchMarkMemoryInfo + gpu: List[BenchMarkMemoryInfo] + + +class BenchMarkResult: + """Class containing the relevant benchmark results.""" + + benchmark_start_time: datetime + """Start time of the 
benchmark run.""" + + benchmark_end_time: datetime + """End time of the benchmark run.""" + + primaite_version: str + """The version of PrimAITE being benchmarked.""" + + system_info: BenchMarkSystemInfo + """System information of the machine that ran the benchmark.""" + + total_sessions: int + """The number of sessions that the benchmark ran.""" + + total_episodes: int + """The number of episodes over all the sessions that the benchmark ran.""" + + total_timesteps: int + """The number of timesteps over all the sessions that the benchmark ran.""" + + average_seconds_per_session: float + """The average time per session.""" + + average_seconds_per_step: float + """The average time per step.""" + + average_seconds_per_100_steps_and_10_nodes: float + """The average time per 100 steps on a 10 node network.""" + + combined_average_reward_per_episode: Dict + """tbd.""" diff --git a/benchmark/utils/report.py b/benchmark/utils/report.py new file mode 100644 index 00000000..0b509d37 --- /dev/null +++ b/benchmark/utils/report.py @@ -0,0 +1,304 @@ +# © Crown-owned copyright 2023, Defence Science and Technology Laboratory UK +import json +import shutil +import sys +from datetime import datetime +from pathlib import Path +from typing import Dict, Optional + +import plotly.graph_objects as go +import polars as pl +import yaml +from plotly.graph_objs import Figure +from pylatex import Command, Document +from pylatex import Figure as LatexFigure +from pylatex import Section, Subsection, Tabular +from pylatex.utils import bold + +import primaite +from benchmark.utils.utils import _get_system_info + +PLOT_CONFIG = { + "size": {"auto_size": False, "width": 1500, "height": 900}, + "template": "plotly_white", + "range_slider": False, +} + + +def _build_benchmark_results_dict(start_datetime: datetime, metadata_dict: Dict, config: Dict) -> dict: + n = len(metadata_dict) + + averaged_data = { + "start_timestamp": start_datetime.isoformat(), + "end_datetime": datetime.now().isoformat(), + 
"primaite_version": primaite.__version__, + "system_info": _get_system_info(), + "total_sessions": n, + "total_episodes": sum(d["total_episodes"] for d in metadata_dict.values()), + "total_time_steps": sum(d["total_time_steps"] for d in metadata_dict.values()), + "av_s_per_session": sum(d["total_s"] for d in metadata_dict.values()) / n, + "av_s_per_step": sum(d["s_per_step"] for d in metadata_dict.values()) / n, + "av_s_per_100_steps_10_nodes": sum(d["s_per_100_steps_10_nodes"] for d in metadata_dict.values()) / n, + "combined_av_reward_per_episode": {}, + "session_av_reward_per_episode": {k: v["av_reward_per_episode"] for k, v in metadata_dict.items()}, + "config": config, + } + + episode_averages = [episode["av_reward_per_episode"] for episode in metadata_dict.values()] + + episode = 0 + for episode_average in episode_averages: + episode += 1 + averaged_data["combined_av_reward_per_episode"][str(episode)] = episode_average + + return averaged_data + + +def _get_df_from_episode_av_reward_dict(data: Dict) -> pl.DataFrame: + data: Dict = {"episode": data.keys(), "av_reward": data.values()} + + return ( + pl.from_dict(data) + .with_columns(rolling_mean=pl.col("av_reward").rolling_mean(window_size=25)) + .rename({"rolling_mean": "rolling_av_reward"}) + ) + + +def _plot_benchmark_metadata( + benchmark_metadata_dict: Dict, + title: Optional[str] = None, + subtitle: Optional[str] = None, +) -> Figure: + if title: + if subtitle: + title = f"{title}
{subtitle}" + else: + if subtitle: + title = subtitle + + layout = go.Layout( + autosize=PLOT_CONFIG["size"]["auto_size"], + width=PLOT_CONFIG["size"]["width"], + height=PLOT_CONFIG["size"]["height"], + ) + # Create the line graph with a colored line + fig = go.Figure(layout=layout) + fig.update_layout(template=PLOT_CONFIG["template"]) + + for session, av_reward_dict in benchmark_metadata_dict["session_av_reward_per_episode"].items(): + df = _get_df_from_episode_av_reward_dict(av_reward_dict) + fig.add_trace( + go.Scatter( + x=df["episode"], + y=df["av_reward"], + mode="lines", + name=f"Session {session}", + opacity=0.25, + line={"color": "#a6a6a6"}, + ) + ) + + df = _get_df_from_episode_av_reward_dict(benchmark_metadata_dict["combined_av_reward_per_episode"]) + fig.add_trace( + go.Scatter( + x=df["episode"], y=df["av_reward"], mode="lines", name="Combined Session Av", line={"color": "#FF0000"} + ) + ) + + fig.add_trace( + go.Scatter( + x=df["episode"], + y=df["rolling_av_reward"], + mode="lines", + name="Rolling Av (Combined Session Av)", + line={"color": "#4CBB17"}, + ) + ) + + # Set the layout of the graph + fig.update_layout( + xaxis={ + "title": "Episode", + "type": "linear", + }, + yaxis={"title": "Average Reward"}, + title=title, + ) + + return fig + + +def _plot_all_benchmarks_combined_session_av(results_directory: Path) -> Figure: + """ + Plot the Benchmark results for each released version of PrimAITE. + + Does this by iterating over the ``benchmark/results`` directory and + extracting the benchmark metadata json for each version that has been + benchmarked. The combined_av_reward_per_episode is extracted from each, + converted into a polars dataframe, and plotted as a scatter line in plotly. + """ + title = "PrimAITE Versions Learning Benchmark" + subtitle = "Rolling Av (Combined Session Av)" + if title: + if subtitle: + title = f"{title}
{subtitle}" + else: + if subtitle: + title = subtitle + layout = go.Layout( + autosize=PLOT_CONFIG["size"]["auto_size"], + width=PLOT_CONFIG["size"]["width"], + height=PLOT_CONFIG["size"]["height"], + ) + # Create the line graph with a colored line + fig = go.Figure(layout=layout) + fig.update_layout(template=PLOT_CONFIG["template"]) + + for dir in results_directory.iterdir(): + if dir.is_dir(): + metadata_file = dir / f"{dir.name}_benchmark_metadata.json" + with open(metadata_file, "r") as file: + metadata_dict = json.load(file) + df = _get_df_from_episode_av_reward_dict(metadata_dict["combined_av_reward_per_episode"]) + + fig.add_trace(go.Scatter(x=df["episode"], y=df["rolling_av_reward"], mode="lines", name=dir.name)) + + # Set the layout of the graph + fig.update_layout( + xaxis={ + "title": "Episode", + "type": "linear", + }, + yaxis={"title": "Average Reward"}, + title=title, + ) + fig["data"][0]["showlegend"] = True + + return fig + + +def build_benchmark_latex_report( + benchmark_start_time: datetime, session_metadata: Dict, config_path: Path, results_root_path: Path +) -> None: + """Generates a latex report of the benchmark run.""" + # generate report folder + v_str = f"v{primaite.__version__}" + + version_result_dir = results_root_path / v_str + if version_result_dir.exists(): + shutil.rmtree(version_result_dir) + version_result_dir.mkdir(exist_ok=True, parents=True) + + # load the config file as dict + with open(config_path, "r") as f: + cfg_data = yaml.safe_load(f) + + # generate the benchmark metadata dict + benchmark_metadata_dict = _build_benchmark_results_dict( + start_datetime=benchmark_start_time, metadata_dict=session_metadata, config=cfg_data + ) + + with open(version_result_dir / f"{v_str}_benchmark_metadata.json", "w") as file: + json.dump(benchmark_metadata_dict, file, indent=4) + title = f"PrimAITE v{primaite.__version__.strip()} Learning Benchmark" + fig = _plot_benchmark_metadata(benchmark_metadata_dict, title=title) + 
this_version_plot_path = version_result_dir / f"{title}.png" + fig.write_image(this_version_plot_path) + + fig = _plot_all_benchmarks_combined_session_av(results_root_path) + + all_version_plot_path = results_root_path / "PrimAITE Versions Learning Benchmark.png" + fig.write_image(all_version_plot_path) + + geometry_options = {"tmargin": "2.5cm", "rmargin": "2.5cm", "bmargin": "2.5cm", "lmargin": "2.5cm"} + data = benchmark_metadata_dict + primaite_version = data["primaite_version"] + + # Create a new document + doc = Document("report", geometry_options=geometry_options) + # Title + doc.preamble.append(Command("title", f"PrimAITE {primaite_version} Learning Benchmark")) + doc.preamble.append(Command("author", "PrimAITE Dev Team")) + doc.preamble.append(Command("date", datetime.now().date())) + doc.append(Command("maketitle")) + + sessions = data["total_sessions"] + episodes = data["total_episodes"] // sessions + steps = data["total_time_steps"] // max(data["total_episodes"], 1) + + # Body + with doc.create(Section("Introduction")): + doc.append( + f"PrimAITE v{primaite_version} was benchmarked automatically upon release. Learning rate metrics " + f"were captured to be referenced during system-level testing and user acceptance testing (UAT)." + ) + doc.append( + f"\nThe benchmarking process consists of running {sessions} training sessions using the same " + f"training and lay down config files. Each session trains an agent for {episodes} episodes, " + f"with each episode consisting of {steps} steps." + ) + doc.append( + f"\nThe mean reward per episode from each session is captured. This is then used to calculate a " + f"combined average reward per episode from the {sessions} individual sessions for smoothing. " + f"Finally, a 25-window rolling average of the combined average reward per session is calculated for " + f"further smoothing."
+ ) + + with doc.create(Section("System Information")): + with doc.create(Subsection("Python")): + with doc.create(Tabular("|l|l|")) as table: + table.add_hline() + table.add_row((bold("Version"), sys.version)) + table.add_hline() + for section, section_data in data["system_info"].items(): + if section_data: + with doc.create(Subsection(section)): + if isinstance(section_data, dict): + with doc.create(Tabular("|l|l|")) as table: + table.add_hline() + for key, value in section_data.items(): + table.add_row((bold(key), value)) + table.add_hline() + elif isinstance(section_data, list): + headers = section_data[0].keys() + tabs_str = "|".join(["l" for _ in range(len(headers))]) + tabs_str = f"|{tabs_str}|" + with doc.create(Tabular(tabs_str)) as table: + table.add_hline() + table.add_row([bold(h) for h in headers]) + table.add_hline() + for item in section_data: + table.add_row(item.values()) + table.add_hline() + + headers_map = { + "total_sessions": "Total Sessions", + "total_episodes": "Total Episodes", + "total_time_steps": "Total Steps", + "av_s_per_session": "Av Session Duration (s)", + "av_s_per_step": "Av Step Duration (s)", + "av_s_per_100_steps_10_nodes": "Av Duration per 100 Steps per 10 Nodes (s)", + } + with doc.create(Section("Stats")): + with doc.create(Subsection("Benchmark Results")): + with doc.create(Tabular("|l|l|")) as table: + table.add_hline() + for section, header in headers_map.items(): + if section.startswith("av_"): + table.add_row((bold(header), f"{data[section]:.4f}")) + else: + table.add_row((bold(header), data[section])) + table.add_hline() + + with doc.create(Section("Graphs")): + with doc.create(Subsection(f"PrimAITE {primaite_version} Learning Benchmark Plot")): + with doc.create(LatexFigure(position="h!")) as pic: + pic.add_image(str(this_version_plot_path)) + pic.add_caption(f"PrimAITE {primaite_version} Learning Benchmark Plot") + + with doc.create(Subsection("PrimAITE All Versions Learning Benchmark Plot")): + with 
doc.create(LatexFigure(position="h!")) as pic: + pic.add_image(str(all_version_plot_path)) + pic.add_caption("PrimAITE All Versions Learning Benchmark Plot") + + doc.generate_pdf(str(this_version_plot_path).replace(".png", ""), clean_tex=True) diff --git a/benchmark/utils/utils.py b/benchmark/utils/utils.py new file mode 100644 index 00000000..f15c4a12 --- /dev/null +++ b/benchmark/utils/utils.py @@ -0,0 +1,47 @@ +# © Crown-owned copyright 2023, Defence Science and Technology Laboratory UK +import platform +from typing import Dict + +import psutil +from GPUtil import GPUtil + + +def get_size(size_bytes: int) -> str: + """ + Scale bytes to its proper format. + + e.g: + 1253656 => '1.20MB' + 1253656678 => '1.17GB' + + : + """ + factor = 1024 + for unit in ["", "K", "M", "G", "T", "P"]: + if size_bytes < factor: + return f"{size_bytes:.2f}{unit}B" + size_bytes /= factor + + +def _get_system_info() -> Dict: + """Builds and returns a dict containing system info.""" + uname = platform.uname() + cpu_freq = psutil.cpu_freq() + virtual_mem = psutil.virtual_memory() + swap_mem = psutil.swap_memory() + gpus = GPUtil.getGPUs() + return { + "System": { + "OS": uname.system, + "OS Version": uname.version, + "Machine": uname.machine, + "Processor": uname.processor, + }, + "CPU": { + "Physical Cores": psutil.cpu_count(logical=False), + "Total Cores": psutil.cpu_count(logical=True), + "Max Frequency": f"{cpu_freq.max:.2f}Mhz", + }, + "Memory": {"Total": get_size(virtual_mem.total), "Swap Total": get_size(swap_mem.total)}, + "GPU": [{"Name": gpu.name, "Total Memory": f"{gpu.memoryTotal}MB"} for gpu in gpus], + } diff --git a/pyproject.toml b/pyproject.toml index 9f7eda52..9001cd30 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ dependencies = [ "numpy==1.23.5", "platformdirs==3.5.1", "plotly==5.15.0", - "polars==0.18.4", + "polars==0.20.30", "prettytable==3.8.0", "PyYAML==6.0", "stable-baselines3[extra]==2.1.0", diff --git a/src/primaite/session/environment.py 
b/src/primaite/session/environment.py index 4d0544e9..7af9a75d 100644 --- a/src/primaite/session/environment.py +++ b/src/primaite/session/environment.py @@ -37,6 +37,8 @@ class PrimaiteGymEnv(gymnasium.Env): """Name of the RL agent. Since there should only be one RL agent we can just pull the first and only key.""" self.episode_counter: int = 0 """Current episode number.""" + self.average_reward_per_episode: Dict[int, float] = {} + """Average rewards of agents per episode.""" @property def agent(self) -> ProxyAgent: @@ -89,6 +91,8 @@ class PrimaiteGymEnv(gymnasium.Env): f"Resetting environment, episode {self.episode_counter}, " f"avg. reward: {self.agent.reward_function.total_reward}" ) + self.average_reward_per_episode[self.episode_counter] = self.agent.reward_function.total_reward + if self.io.settings.save_agent_actions: all_agent_actions = {name: agent.action_history for name, agent in self.game.agents.items()} self.io.write_agent_actions(agent_actions=all_agent_actions, episode=self.episode_counter)