Created in April 2026
PDF: paulgalea.com/projects/machine_learns_to_solve_maze/visualisation.pdf

/* ..................... */
/* ........TOOLS........ */
/* ..................... */

Python
Adobe Illustrator

/* ..................... */
/* ....PYTHON SCRIPT.... */
/* ..................... */

import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap as lc
from concurrent.futures import ProcessPoolExecutor, as_completed

# Best seed: 88 (seeds 1-100 checked)
SEED = 88  # For reproducibility by ensuring the same pseudo-randomness
'''Seed 88 was selected subjectively based on the clarity with which its
trajectory heatmaps visualise learning progression. Its heatmaps exhibit
progressive left-to-right (start-to-finish) intensification, selection of
the shortest path, and convergence across the final 100 attempts.'''
random.seed(SEED)
np.random.seed(SEED)

'''The maze is not intended to convey any symbolism or meaning. Its build
was governed by the following subjective design preferences to ensure
visual clarity and clear development boundaries:
- Perimeter-enclosed, excluding start and finish
- Solvable (at least one path from start to finish)
- A width of 21 cells and a height of 13 cells
- Uniform single-cell width for all walls and paths
- Internal straight segments capped at five cells
- No adjacent internal straight five-cell segments
- Acyclic pathing, barring multi-solution branches
However, backtracking is permitted, and wall collisions leave the agent
on the same cell.

Maze used:
X X X X X X X X X X X X X X X X X X X X X
X . X . . . X . . X . . . . X . X . . . X
X . X X X . . . X X X . X . . . X . X X X
X . . X . . X X X . . . X X X . X . X . X
X X . X . X X . X X X . . . X . . . . . X
X . . . . X . . . . X X . X X X . X X . X
Start . X X . X X X X . X . . . . X X X . . Finish
X . . X . . . X . . . . X X . . . X . X X
X . X X X . X X . X X X X . . X . X . . X
X . . . X X X . . . X . X . X X X X X . X
X X X . . . X X X . . . X . . . X . X . X
X . . . X . . . . . X . X . X . . . . . X
X X X X X X X X X X X X X X X X X X X X X

Theoretically, the agent, as defined below, could solve any such maze.'''

S, W, P, F = 0, -1, 0, 1
maze = pd.read_excel(r"Maze.xlsx", usecols="A:U", header=None, nrows=13)
start_state = np.argmax(maze.to_numpy().ravel() == 'S')
finish_state = np.argmax(maze.to_numpy().ravel() == 'F')
maze = maze.replace({'S': S, 'W': W, 'F': F, 'P': P})
maze = maze.to_numpy().astype(int)
rewards = maze.flatten()
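'''A minimal sanity check (illustrative, not from the original script):
states index the flattened 13x21 grid row-major, so state = row * 21 + col.
With the maze above, the start and finish cells sit on the middle row, at
the left and right edges respectively.'''
assert divmod(int(start_state), 21) == (6, 0)    # 'S' at row 6, col 0
assert divmod(int(finish_state), 21) == (6, 20)  # 'F' at row 6, col 20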
'''The maze has two solution paths: the optimal (shortest) path and the
sub-optimal (longer) path. These are hard-coded for use as assessment
benchmarks in the alpha, gamma, and epsilon-decay tuning detailed below.'''
OPTIMAL_PATH = [126, 127, 148, 169, 190, 191, 192, 213, 214, 215, 236,
                237, 238, 239, 240, 219, 198, 197, 176, 155, 156, 157,
                158, 137, 138, 117, 96, 95, 74, 53, 32, 33, 34, 55, 56,
                57, 78, 99, 100, 101, 102, 103, 124, 145, 146]
SUBOPTIMAL_PATH = [126, 127, 148, 169, 190, 191, 192, 213, 214, 215, 236,
                   237, 238, 239, 240, 219, 198, 197, 176, 155, 156, 157,
                   158, 137, 138, 139, 140, 161, 182, 181, 202, 223, 224,
                   225, 246, 247, 248, 249, 250, 229, 208, 187, 186, 165,
                   144, 145, 146]

# Alphas, gammas, and epsilon decays for grid-based hyperparameter tuning
alphas = gammas = np.round(np.arange(0.0, 1.01, 0.1), 1)
decays = np.round(np.arange(0.99, 0.48, -0.05), 2)  # Epsilon decays

def build_transitions(maze):
    n_rows, n_cols = maze.shape
    n_states = n_rows * n_cols
    transitions_table = np.zeros((n_states, 4), dtype=int)
    directions = [(-1, 0), (0, 1), (1, 0), (0, -1)]  # u, r, d, l
    for state in range(n_states):
        row, col = divmod(state, n_cols)  # Maze row and column of state
        for action, (d_row, d_col) in enumerate(directions):
            next_row, next_col = row + d_row, col + d_col
            valid_move = (
                0 <= next_row < n_rows and
                0 <= next_col < n_cols and
                maze[next_row, next_col] != W)
            if valid_move:
                next_state = next_row * n_cols + next_col
            else:
                next_state = state  # If agent walks into a wall it loops
            transitions_table[state, action] = next_state
    return transitions_table  # Next states per state based on directions

transitions = build_transitions(maze)

def step(state, action):
    if random.random() < 0.1:
        '''The agent 'slips' 10% of the time, where it executes a random
        unintended action. It evaluates performance based on where it
        lands but attributes it to its intended action. Alpha balances
        learning speed against noise from these misattributions.'''
        # Slip action must be different to intended action
        actions = [a for a in range(transitions.shape[1]) if a != action]
        action = random.choice(actions)
    next_state = transitions[state, action]
    reward = rewards[next_state]
    reward -= 0.01  # Step penalty pressures progression to optimal path
    return next_state, reward
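'''A minimal sanity check (illustrative, not from the original script):
per the maze above, from the start cell (state 126) moving up or down hits
perimeter walls and moving left falls outside the grid, so all three loop
back to the start; only 'right' advances the agent.'''
assert list(transitions[start_state]) == [126, 127, 126, 126]  # u, r, d, l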
def agent(attempts=800, steps=60, epsilon=1,
          epsilon_decay=0.74, alpha=0.6, gamma=0.5):
    '''The 800-attempt limit serves as a practical training horizon. In
    addition to being sufficient for convergence, it enables a clear 3x3
    heatmap arrangement. Each panel visualises 100 attempts, bar a cutout
    in the space of one panel for the legend.

    The 60-step limit provides sufficient exploration for convergence
    within 800 attempts, and accommodates slip- and random-action-induced
    increases beyond the theoretical 44-step optimal path. Along with the
    maze itself, the 60-step limit, 10% slip chance, and -0.01 step
    penalty are considered to be part of the environment.

    Epsilon starts at one as the agent must initially guess actions. As
    the agent learns, epsilon decays so that it can exploit its knowledge,
    but only to a 0.02 minimum to maintain exploration for optimal
    pathfinding.

    To constrain the search space for grid-based hyperparameter tuning,
    epsilon decay was restricted to increments of 0.05 from 0.49 to 0.99.
    Moreover, alpha and gamma were discretised to one decimal place.
    Alpha, gamma, and epsilon decay were then selected based on 'stable
    convergence' across the final panel. Namely, for seeds 1-100, across
    the final 100 attempts, the greedy policy derived from end-of-attempt
    Q-tables of the alpha-gamma-epsilon-decay combination:
    - Follows the optimal path more than any other combination
    - Follows the optimal or sub-optimal path ≥95% of the time'''
    q = np.zeros(transitions.shape)
    attempt_rewards = []
    attempt_trajectories = []
    attempt_q_tables = []
    for attempt in range(attempts):
        state = start_state
        attempt_reward = 0
        attempt_trajectory = [state]
        for _ in range(steps):
            if random.random() < epsilon:  # Explore
                action = random.randint(0, transitions.shape[1] - 1)
            else:  # Exploit
                # If tied max Q values, chooses random amongst max values
                action = np.random.choice(
                    np.where(q[state] == q[state].max())[0])
            next_state, reward = step(state, action)
            '''Gamma discounts future rewards based on delay. Notably, it
            helps to mitigate loops as they delay reaching the reward.'''
            target = reward + (gamma * np.max(q[next_state]))
            error = target - q[state, action]
            q[state, action] += alpha * error
            attempt_reward += reward
            state = next_state
            attempt_trajectory.append(state)
            if state == finish_state:
                break
        attempt_rewards.append(attempt_reward)
        attempt_trajectories.append(attempt_trajectory)
        attempt_q_tables.append(q.copy())
        '''Sometimes, the agent converges to a sub-optimal path. After
        convergence, attempts average about 52-55 steps. Thus, a minimum
        0.02 epsilon means the agent continues taking roughly one random
        action per attempt, ensuring eventual optimal path discovery.'''
        epsilon = max(epsilon * epsilon_decay, 0.02)
    return attempt_rewards, attempt_trajectories, attempt_q_tables

def visualise_performance(attempt_rewards):
    '''Used for quantitative assessment of convergence speed and
    stability within the environment.'''
    plt.plot(range(1, len(attempt_rewards) + 1), attempt_rewards)
    plt.xlabel('Attempt')
    plt.ylabel('Reward')
    plt.show()

def visualise_trajectory(trajectories, start, end):
    '''The agent revisits dead-end paths partly due to slips and random
    exploration, but primarily as step penalties induce oscillatory
    bootstrapped Q-values before convergence.'''
    n_rows, n_cols = maze.shape
    n_states = n_rows * n_cols
    dpi, cell_px = 72, 16
    fig = plt.figure(figsize=(
        n_cols * cell_px / dpi, n_rows * cell_px / dpi), dpi=dpi)
    ax = fig.add_axes([0, 0, 1, 1])  # Ensure maze fills entire figure
    visit_counts = np.zeros(n_states)
    for trajectory in trajectories[start-1:end]:
        for state in trajectory:
            visit_counts[state] += 1
    ax.imshow(visit_counts.reshape(n_rows, n_cols), cmap='viridis')
    ax.imshow(np.ma.masked_where(maze != W, maze), cmap=lc(['#000']))
    ax.axis('off')
    plt.show()

def visualise_legend():
    _, n_cols = maze.shape
    dpi, cell_px = 72, 16
    fig = plt.figure(figsize=(
        (n_cols/2) * cell_px / dpi, cell_px / dpi), dpi=dpi)
    ax = fig.add_axes([0, 0, 1, 1])
    ax.axis('off')
    fig.colorbar(plt.cm.ScalarMappable(cmap='viridis'),
                 cax=ax, orientation='horizontal')
    plt.show()

attempt_rewards, attempt_trajectories, attempt_q_tables = agent()
#visualise_performance(attempt_rewards)
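'''A minimal sketch of the epsilon schedule (illustrative, not from the
original script): decay is applied at the end of each attempt, so with the
default 0.74 decay the 0.02 floor is reached after 13 decays, since
0.74**13 ≈ 0.0199 < 0.02. At the floor, an attempt of about 52-55 steps
sees roughly 0.02 * 52 ≈ 1.0 to 0.02 * 55 ≈ 1.1 expected random actions,
matching the 'roughly one random action per attempt' noted above.'''
epsilon_floor_attempt = next(
    n for n in range(800) if max(0.74 ** n, 0.02) == 0.02)
assert epsilon_floor_attempt == 13  # Floor holds from attempt 14 onward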
# Heatmaps reflect attempts 1-100, 101-200, 201-300, ..., 701-800
for start, end in [[i, i+99] for i in range(1, 800, 100)]:
    visualise_trajectory(attempt_trajectories, start, end)
visualise_legend()

def evaluate_seed(seed, steps):
    optimal_solution = np.zeros((len(alphas), len(gammas), len(decays)))
    solution = np.zeros((len(alphas), len(gammas), len(decays)))
    for a, alpha in enumerate(alphas):
        for g, gamma in enumerate(gammas):
            for d, decay in enumerate(decays):
                # Reset seed each run as prior runs advance random state
                random.seed(seed)
                np.random.seed(seed)
                _, _, attempt_q_tables = agent(
                    alpha=alpha, gamma=gamma, epsilon_decay=decay)
                optimal = 0
                suboptimal = 0
                for q in attempt_q_tables[700:]:  # From last 100 attempts
                    state = start_state
                    path = [state]
                    for _ in range(steps):
                        action = np.argmax(q[state])  # Greedy policy
                        state = transitions[state, action]
                        path.append(state)
                        if state == finish_state:
                            break
                    if path == OPTIMAL_PATH:
                        optimal += 1
                    elif path == SUBOPTIMAL_PATH:
                        suboptimal += 1
                optimal_solution[a, g, d] += optimal
                solution[a, g, d] += optimal + suboptimal
    return optimal_solution, solution

def evaluate_hyperparameter_combinations(steps=60, seeds=100):
    optimal_solutions = np.zeros((len(alphas), len(gammas), len(decays)))
    solutions = np.zeros((len(alphas), len(gammas), len(decays)))
    with ProcessPoolExecutor() as executor:
        futures = {executor.submit(evaluate_seed, seed, steps): seed
                   for seed in range(1, seeds + 1)}
        for future in as_completed(futures):
            optimal_solution, solution = future.result()
            optimal_solutions += optimal_solution
            solutions += solution
    rows = []
    for a, alpha in enumerate(alphas):
        for g, gamma in enumerate(gammas):
            for d, decay in enumerate(decays):
                rows.append({
                    'Alpha': alpha,
                    'Gamma': gamma,
                    'Epsilon Decay': decay,
                    'Optimal Solution': int(optimal_solutions[a, g, d]),
                    'Solution': int(solutions[a, g, d]),
                    'Attempts': seeds * 100})  # 100 attempts per seed
    df = pd.DataFrame(rows)
    df.to_csv('hyperparameter_combination_results.csv', index=False)

'''Top 10 (of 1,331 hyperparameter combinations) by Optimal Solution,
from 10,000 attempts (the final 100 attempts of each of seeds 1-100):

Alpha  Gamma  Epsilon Decay  Optimal Solution  Solution
0.6    0.5    0.74           8,958 (top)       9,533
0.6    0.5    0.69           8,931             9,593
0.6    0.6    0.79           8,826             9,595
0.6    0.5    0.49           8,713             9,507
0.6    0.5    0.94           8,695             9,539
0.6    0.6    0.94           8,685             9,488
0.6    0.6    0.89           8,683             9,632 (top)
0.6    0.8    0.54           8,683             9,454
0.6    0.5    0.59           8,659             9,535
0.6    0.6    0.54           8,593             9,554
...    ...    ...            ...               ...

Selection: Alpha: 0.6, Gamma: 0.5, Epsilon Decay: 0.74

Epsilon decay can be aggressive, as the agent retains stochasticity via
random choices among max-Q ties, slips, and minimum epsilon.'''

#if __name__ == "__main__":
#    evaluate_hyperparameter_combinations()
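'''A minimal sketch (illustrative, not from the original script) of how
the 'stable convergence' selection above can be reproduced from the CSV
written by evaluate_hyperparameter_combinations, using the column names
it defines. Left commented out, like the call above, since the CSV only
exists after the tuning run:'''
#df = pd.read_csv('hyperparameter_combination_results.csv')
#stable = df[df['Solution'] / df['Attempts'] >= 0.95]  # >=95% (sub)optimal
#print(stable.sort_values('Optimal Solution', ascending=False).head(1))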