Created in April 2026
PDF: paulgalea.com/projects/machine_learns_to_solve_maze/visualisation.pdf

/* ..................... */
/* ........TOOLS........ */
/* ..................... */

Python
Adobe Illustrator

/* ..................... */
/* ....PYTHON SCRIPT.... */
/* ..................... */

import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap as lc
from concurrent.futures import ProcessPoolExecutor, as_completed

# Best seed: 88 (seeds 1-100 checked)
SEED = 88  # For reproducibility by ensuring the same pseudo-randomness
'''Seed 88 was selected subjectively based on the clarity with which its
trajectory heatmaps visualise learning progression. Its heatmaps exhibit
progressive left-to-right (start-to-finish) intensification, selection of
the shortest path, and convergence across the final 100 attempts.'''
random.seed(SEED)
np.random.seed(SEED)

'''The maze is not intended to convey any symbolism or meaning. Its build
was governed by the following subjective design preferences to ensure
visual clarity and clear development boundaries:
- Perimeter-enclosed, excluding start and finish
- Solvable (at least one path from start to finish)
- A width of 21 cells and a height of 13 cells
- Uniform single-cell width for all walls and paths
- Internal straight segments capped at five cells
- No adjacent internal straight five-cell segments
- Acyclic pathing, barring multi-solution branches
However, backtracking is permitted, and wall collisions leave the agent
on the same cell.

Maze used:
X X X X X X X X X X X X X X X X X X X X X
X . X . . . X . . X . . . . X . X . . . X
X . X X X . . . X X X . X . . . X . X X X
X . . X . . X X X . . . X X X . X . X . X
X X . X . X X . X X X . . . X . . . . . X
X . . . . X . . . . X X . X X X . X X . X
Start . X X . X X X X . X . . . . X X X . . Finish
X . . X . . . X . . . . X X . . . X . X X
X . X X X . X X . X X X X . . X . X . . X
X . . . X X X . . . X . X . X X X X X . X
X X X . . . X X X . . . X . . . X . X . X
X . . . X . . . . . X . X . X . . . . . X
X X X X X X X X X X X X X X X X X X X X X

Theoretically, the agent, as defined below, could solve any such maze.'''

S, W, P, F = 0, -1, 0, 1
maze = pd.read_excel(r"Maze.xlsx", usecols="A:U", header=None, nrows=13)
start_state = np.argmax(maze.to_numpy().ravel() == 'S')
finish_state = np.argmax(maze.to_numpy().ravel() == 'F')
maze = maze.replace({'S': S, 'W': W, 'F': F, 'P': P})
maze = maze.to_numpy().astype(int)
rewards = maze.flatten()
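'''A minimal sanity check (illustrative, not from the original script):
states index the flattened 13x21 grid row-major, so state = row * 21 + col.
With the maze above, the start and finish cells sit on the middle row, at
the left and right edges respectively.'''
assert divmod(int(start_state), 21) == (6, 0)    # 'S' at row 6, col 0
assert divmod(int(finish_state), 21) == (6, 20)  # 'F' at row 6, col 20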
'''The maze has two solution paths: the optimal (shortest) path and the
sub-optimal (longer) path. These are hard-coded for use as assessment
benchmarks in the alpha, gamma, and epsilon-decay tuning detailed below.'''
OPTIMAL_PATH = [126, 127, 148, 169, 190, 191, 192, 213, 214, 215, 236,
                237, 238, 239, 240, 219, 198, 197, 176, 155, 156, 157,
                158, 137, 138, 117, 96, 95, 74, 53, 32, 33, 34, 55, 56,
                57, 78, 99, 100, 101, 102, 103, 124, 145, 146]
SUBOPTIMAL_PATH = [126, 127, 148, 169, 190, 191, 192, 213, 214, 215, 236,
                   237, 238, 239, 240, 219, 198, 197, 176, 155, 156, 157,
                   158, 137, 138, 139, 140, 161, 182, 181, 202, 223, 224,
                   225, 246, 247, 248, 249, 250, 229, 208, 187, 186, 165,
                   144, 145, 146]

# Alphas, gammas, and epsilon decays for grid-based hyperparameter tuning
alphas = gammas = np.round(np.arange(0.0, 1.01, 0.1), 1)
decays = np.round(np.arange(0.99, 0.48, -0.05), 2)  # Epsilon decays

def build_transitions(maze):
    n_rows, n_cols = maze.shape
    n_states = n_rows * n_cols
    transitions_table = np.zeros((n_states, 4), dtype=int)
    directions = [(-1, 0), (0, 1), (1, 0), (0, -1)]  # u, r, d, l
    for state in range(n_states):
        row, col = divmod(state, n_cols)  # Maze row and column of state
        for action, (d_row, d_col) in enumerate(directions):
            next_row, next_col = row + d_row, col + d_col
            valid_move = (
                0 <= next_row < n_rows and
                0 <= next_col < n_cols and
                maze[next_row, next_col] != W)
            if valid_move:
                next_state = next_row * n_cols + next_col
            else:
                next_state = state  # If agent walks into a wall it loops
            transitions_table[state, action] = next_state
    return transitions_table  # Next states per state based on directions

transitions = build_transitions(maze)

def step(state, action):
    if random.random() < 0.1:
        '''The agent 'slips' 10% of the time, where it executes a random
        unintended action. It evaluates performance based on where it
        lands but attributes it to its intended action. Alpha balances
        learning speed against noise from these misattributions.'''
        # Slip action must be different to intended action
        actions = [a for a in range(transitions.shape[1]) if a != action]
        action = random.choice(actions)
    next_state = transitions[state, action]
    reward = rewards[next_state]
    reward -= 0.01  # Step penalty pressures progression to optimal path
    return next_state, reward
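'''A minimal sanity check (illustrative, not from the original script):
per the maze above, from the start cell (state 126) moving up or down hits
perimeter walls and moving left falls outside the grid, so all three loop
back to the start; only 'right' advances the agent.'''
assert list(transitions[start_state]) == [126, 127, 126, 126]  # u, r, d, l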
def agent(attempts=800, steps=60, epsilon=1,
          epsilon_decay=0.74, alpha=0.6, gamma=0.5):
    '''The 800-attempt limit serves as a practical training horizon. In
    addition to being sufficient for convergence, it enables a clear 3x3
    heatmap arrangement. Each panel visualises 100 attempts, bar a cutout
    in the space of one panel for the legend.

    The 60-step limit provides sufficient exploration for convergence
    within 800 attempts, and accommodates slip- and random-action-induced
    increases beyond the theoretical 44-step optimal path. Along with the
    maze itself, the 60-step limit, 10% slip chance, and -0.01 step
    penalty are considered to be part of the environment.

    Epsilon starts at one as the agent must initially guess actions. As
    the agent learns, epsilon decays so that it can exploit its knowledge,
    but only to a 0.02 minimum to maintain exploration for optimal
    pathfinding.

    To constrain the search space for grid-based hyperparameter tuning,
    epsilon decay was restricted to increments of 0.05 from 0.49 to 0.99.
    Moreover, alpha and gamma were discretised to one decimal place.
    Alpha, gamma, and epsilon decay were then selected based on 'stable
    convergence' across the final panel. Namely, for seeds 1-100, across
    the final 100 attempts, the greedy policy derived from end-of-attempt
    Q-tables of the alpha-gamma-epsilon-decay combination:
    - Follows the optimal path more than any other combination
    - Follows the optimal or sub-optimal path ≥95% of the time'''
    q = np.zeros(transitions.shape)
    attempt_rewards = []
    attempt_trajectories = []
    attempt_q_tables = []
    for attempt in range(attempts):
        state = start_state
        attempt_reward = 0
        attempt_trajectory = [state]
        for _ in range(steps):
            if random.random() < epsilon:  # Explore
                action = random.randint(0, transitions.shape[1] - 1)
            else:  # Exploit
                # If tied max Q values, chooses random amongst max values
                action = np.random.choice(
                    np.where(q[state] == q[state].max())[0])
            next_state, reward = step(state, action)
            '''Gamma discounts future rewards based on delay. Notably, it
            helps to mitigate loops as they delay reaching the reward.'''
            target = reward + (gamma * np.max(q[next_state]))
            error = target - q[state, action]
            q[state, action] += alpha * error
            attempt_reward += reward
            state = next_state
            attempt_trajectory.append(state)
            if state == finish_state:
                break
        attempt_rewards.append(attempt_reward)
        attempt_trajectories.append(attempt_trajectory)
        attempt_q_tables.append(q.copy())
        '''Sometimes, the agent converges to a sub-optimal path. After
        convergence, attempts average about 52-55 steps. Thus, a minimum
        0.02 epsilon means the agent continues taking roughly one random
        action per attempt, ensuring eventual optimal path discovery.'''
        epsilon = max(epsilon * epsilon_decay, 0.02)
    return attempt_rewards, attempt_trajectories, attempt_q_tables

def visualise_performance(attempt_rewards):
    '''Used for quantitative assessment of convergence speed and
    stability within the environment.'''
    plt.plot(range(1, len(attempt_rewards) + 1), attempt_rewards)
    plt.xlabel('Attempt')
    plt.ylabel('Reward')
    plt.show()

def visualise_trajectory(trajectories, start, end):
    '''The agent revisits dead-end paths partly due to slips and random
    exploration, but primarily as step penalties induce oscillatory
    bootstrapped Q-values before convergence.'''
    n_rows, n_cols = maze.shape
    n_states = n_rows * n_cols
    dpi, cell_px = 72, 16
    fig = plt.figure(figsize=(
        n_cols * cell_px / dpi, n_rows * cell_px / dpi), dpi=dpi)
    ax = fig.add_axes([0, 0, 1, 1])  # Ensure maze fills entire figure
    visit_counts = np.zeros(n_states)
    for trajectory in trajectories[start-1:end]:
        for state in trajectory:
            visit_counts[state] += 1
    ax.imshow(visit_counts.reshape(n_rows, n_cols), cmap='viridis')
    ax.imshow(np.ma.masked_where(maze != W, maze), cmap=lc(['#000']))
    ax.axis('off')
    plt.show()

def visualise_legend():
    _, n_cols = maze.shape
    dpi, cell_px = 72, 16
    fig = plt.figure(figsize=(
        (n_cols/2) * cell_px / dpi, cell_px / dpi), dpi=dpi)
    ax = fig.add_axes([0, 0, 1, 1])
    ax.axis('off')
    fig.colorbar(plt.cm.ScalarMappable(cmap='viridis'),
                 cax=ax, orientation='horizontal')
    plt.show()

attempt_rewards, attempt_trajectories, attempt_q_tables = agent()
#visualise_performance(attempt_rewards)
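'''A minimal sketch of the epsilon schedule (illustrative, not from the
original script): decay is applied at the end of each attempt, so with the
default 0.74 decay the 0.02 floor is reached after 13 decays, since
0.74**13 ≈ 0.0199 < 0.02. At the floor, an attempt of about 52-55 steps
sees roughly 0.02 * 52 ≈ 1.0 to 0.02 * 55 ≈ 1.1 expected random actions,
matching the 'roughly one random action per attempt' noted above.'''
epsilon_floor_attempt = next(
    n for n in range(800) if max(0.74 ** n, 0.02) == 0.02)
assert epsilon_floor_attempt == 13  # Floor holds from attempt 14 onward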
# Heatmaps reflect attempts 1-100, 101-200, 201-300, ..., 701-800
for start, end in [[i, i+99] for i in range(1, 800, 100)]:
    visualise_trajectory(attempt_trajectories, start, end)
visualise_legend()

def evaluate_seed(seed, steps):
    optimal_solution = np.zeros((len(alphas), len(gammas), len(decays)))
    solution = np.zeros((len(alphas), len(gammas), len(decays)))
    for a, alpha in enumerate(alphas):
        for g, gamma in enumerate(gammas):
            for d, decay in enumerate(decays):
                # Reset seed each run as prior runs advance random state
                random.seed(seed)
                np.random.seed(seed)
                _, _, attempt_q_tables = agent(
                    alpha=alpha, gamma=gamma, epsilon_decay=decay)
                optimal = 0
                suboptimal = 0
                for q in attempt_q_tables[700:]:  # From last 100 attempts
                    state = start_state
                    path = [state]
                    for _ in range(steps):
                        action = np.argmax(q[state])  # Greedy policy
                        state = transitions[state, action]
                        path.append(state)
                        if state == finish_state:
                            break
                    if path == OPTIMAL_PATH:
                        optimal += 1
                    elif path == SUBOPTIMAL_PATH:
                        suboptimal += 1
                optimal_solution[a, g, d] += optimal
                solution[a, g, d] += optimal + suboptimal
    return optimal_solution, solution

def evaluate_hyperparameter_combinations(steps=60, seeds=100):
    optimal_solutions = np.zeros((len(alphas), len(gammas), len(decays)))
    solutions = np.zeros((len(alphas), len(gammas), len(decays)))
    with ProcessPoolExecutor() as executor:
        futures = {executor.submit(evaluate_seed, seed, steps): seed
                   for seed in range(1, seeds + 1)}
        for future in as_completed(futures):
            optimal_solution, solution = future.result()
            optimal_solutions += optimal_solution
            solutions += solution
    rows = []
    for a, alpha in enumerate(alphas):
        for g, gamma in enumerate(gammas):
            for d, decay in enumerate(decays):
                rows.append({
                    'Alpha': alpha,
                    'Gamma': gamma,
                    'Epsilon Decay': decay,
                    'Optimal Solution': int(optimal_solutions[a, g, d]),
                    'Solution': int(solutions[a, g, d]),
                    'Attempts': seeds * 100})  # 100 attempts per seed
    df = pd.DataFrame(rows)
    df.to_csv('hyperparameter_combination_results.csv', index=False)

'''Top 10 (of 1,331 hyperparameter combinations) by Optimal Solution,
from 10,000 attempts (the final 100 attempts of each of seeds 1-100):

Alpha  Gamma  Epsilon Decay  Optimal Solution  Solution
0.6    0.5    0.74           8,958 (top)       9,533
0.6    0.5    0.69           8,931             9,593
0.6    0.6    0.79           8,826             9,595
0.6    0.5    0.49           8,713             9,507
0.6    0.5    0.94           8,695             9,539
0.6    0.6    0.94           8,685             9,488
0.6    0.6    0.89           8,683             9,632 (top)
0.6    0.8    0.54           8,683             9,454
0.6    0.5    0.59           8,659             9,535
0.6    0.6    0.54           8,593             9,554
...    ...    ...            ...               ...

Selection: Alpha: 0.6, Gamma: 0.5, Epsilon Decay: 0.74

Epsilon decay can be aggressive, as the agent retains stochasticity via
random choices among max-Q ties, slips, and minimum epsilon.'''

#if __name__ == "__main__":
#    evaluate_hyperparameter_combinations()
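'''A minimal sketch (illustrative, not from the original script) of how
the 'stable convergence' selection above can be reproduced from the CSV
written by evaluate_hyperparameter_combinations, using the column names
it defines. Left commented out, like the call above, since the CSV only
exists after the tuning run:'''
#df = pd.read_csv('hyperparameter_combination_results.csv')
#stable = df[df['Solution'] / df['Attempts'] >= 0.95]  # >=95% (sub)optimal
#print(stable.sort_values('Optimal Solution', ascending=False).head(1))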