# VBAF.RL.QLearningAgent.ps1

#Requires -Version 5.1

<#
.SYNOPSIS
    Q-Learning Agent for Reinforcement Learning
.DESCRIPTION
    Implements Q-Learning algorithm with epsilon-greedy exploration.
    Uses hashtable for Q-table storage (no separate QTable class needed).
.NOTES
    Part of VBAF - Reinforcement Learning Module
    COMPLETE VERSION - All methods included
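.EXAMPLE
    # Minimal sketch (action names are illustrative, not defined by the module):
    $agent = [QLearningAgent]::new(@('Sand', 'Stone', 'Ice'))
    $action = $agent.ChooseAction('START')
    $agent.Learn('START', $action, 2.0, 'Sand')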
#>


# Resolve the base path relative to this script (avoids a hardcoded user-profile path)
$basePath = $PSScriptRoot

# Load companion modules. Note: this class stores its Q-values in a plain
# hashtable and does not itself require these files; they are loaded for
# consumers of the wider RL module.
. "$basePath\VBAF.RL.QTable.ps1"
. "$basePath\VBAF.RL.ExperienceReplay.ps1"

class QLearningAgent {
    # Learning parameters
    [string[]]$Actions                  # Available actions
    [hashtable]$QTable                  # Learned Q-values, keyed "<state>|<action>" (plain hashtable)
    [double]$LearningRate               # Alpha (α)
    [double]$DiscountFactor             # Gamma (γ)
    [double]$Epsilon                    # Exploration rate
    [double]$EpsilonDecay               # How fast epsilon decreases
    [double]$MinEpsilon                 # Minimum epsilon value
    
    # Statistics
    [int]$TotalSteps
    [int]$TotalEpisodes
    [double]$TotalReward
    [System.Collections.ArrayList]$EpisodeRewards
    [int]$ExplorationCount
    [int]$ExploitationCount
    [double]$Alpha                      # Compatibility copy of LearningRate (set at construction)
    [double]$Gamma                      # Compatibility copy of DiscountFactor (set at construction)
    [int]$Episode                       # Compatibility copy of TotalEpisodes (updated in EndEpisode)
    [int]$MemorySize                    # For compatibility with replay-based agents (unused here)
    
    # Simplified constructor with defaults
    QLearningAgent([string[]]$actions) {
        $this.Actions = $actions
        $this.QTable = @{}
        $this.LearningRate = 0.1        # Default alpha
        $this.DiscountFactor = 0.9      # Default gamma
        $this.Epsilon = 1.0             # Start with full exploration
        $this.EpsilonDecay = 0.995      # Decay 0.5% per episode
        $this.MinEpsilon = 0.01         # Always explore at least 1%
        $this.TotalSteps = 0
        $this.TotalEpisodes = 0
        $this.TotalReward = 0.0
        $this.EpisodeRewards = New-Object System.Collections.ArrayList
        $this.ExplorationCount = 0
        $this.ExploitationCount = 0
        $this.Alpha = $this.LearningRate
        $this.Gamma = $this.DiscountFactor
        $this.Episode = 0
        $this.MemorySize = 0
    }
    
    # Constructor with custom learning rate and exploration rate
    QLearningAgent([string[]]$actions, [double]$learningRate, [double]$epsilon) {
        $this.Actions = $actions
        $this.QTable = @{}
        $this.LearningRate = $learningRate
        $this.DiscountFactor = 0.9
        $this.Epsilon = $epsilon
        $this.EpsilonDecay = 0.995
        $this.MinEpsilon = 0.01
        $this.TotalSteps = 0
        $this.TotalEpisodes = 0
        $this.TotalReward = 0.0
        $this.EpisodeRewards = New-Object System.Collections.ArrayList
        $this.ExplorationCount = 0
        $this.ExploitationCount = 0
        $this.Alpha = $this.LearningRate
        $this.Gamma = $this.DiscountFactor
        $this.Episode = 0
        $this.MemorySize = 0
    }
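
    # Example (hypothetical values): a faster learner with moderate exploration:
    #   $agent = [QLearningAgent]::new(@('Sand', 'Stone'), 0.2, 0.5)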
    
    # Get state from context (castle parade specific)
    [string] GetState([hashtable]$context) {
        # State = recent castle types
        if ($null -eq $context.RecentTypes -or $context.RecentTypes.Count -eq 0) {
            return "START"
        }
        
        # Use last 2 castles as state
        $recent = $context.RecentTypes
        if ($recent.Count -eq 1) {
            return $recent[-1]
        } else {
            return "$($recent[-2])|$($recent[-1])"
        }
    }
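
    # Example: RecentTypes = @('Sand')          -> state 'Sand'
    #          RecentTypes = @('Sand', 'Stone') -> state 'Sand|Stone'
    # (castle-type names are illustrative)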
    
    # Calculate reward based on outcome
    [double] CalculateReward([hashtable]$outcome) {
        $reward = 0.0
        
        # Reward for variety (not repeating same castle)
        if ($outcome.IsVaried) {
            $reward += 2.0
        } else {
            $reward -= 1.0  # Penalty for repetition
        }
        
        # Reward for visual balance
        $reward += $outcome.VisualBalance * 1.5
        
        # Reward for engagement
        $reward += $outcome.Engagement * 2.0
        
        return $reward
    }
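
    # Worked example (illustrative outcome values):
    #   IsVaried = $true, VisualBalance = 0.8, Engagement = 0.5
    #   reward = 2.0 + (0.8 * 1.5) + (0.5 * 2.0) = 4.2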
    
    # Get Q-value for state-action pair
    [double] GetQValue([string]$state, [string]$action) {
        $key = "$state|$action"
        
        if ($this.QTable.ContainsKey($key)) {
            return [double]$this.QTable[$key]
        } else {
            return 0.0
        }
    }
    
    # Set Q-value for state-action pair
    [void] SetQValue([string]$state, [string]$action, [double]$value) {
        $key = "$state|$action"
        $this.QTable[$key] = $value
    }
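
    # Note on keys: entries are stored as '<state>|<action>'. Two-castle states
    # also contain '|' (e.g. 'Sand|Stone'), so keys are not uniquely parseable
    # back into (state, action); lookups stay consistent because GetQValue and
    # SetQValue build keys identically.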
    
    # Get best action for a state (ties resolve to the earliest entry in Actions)
    [string] GetBestAction([string]$state) {
        $bestAction = $this.Actions[0]
        $bestValue = $this.GetQValue($state, $bestAction)
        
        foreach ($action in $this.Actions) {
            $qValue = $this.GetQValue($state, $action)
            if ($qValue -gt $bestValue) {
                $bestValue = $qValue
                $bestAction = $action
            }
        }
        
        return $bestAction
    }
    
    # Get max Q-value for a state
    [double] GetMaxQValue([string]$state) {
        $maxValue = $this.GetQValue($state, $this.Actions[0])
        
        foreach ($action in $this.Actions) {
            $qValue = $this.GetQValue($state, $action)
            if ($qValue -gt $maxValue) {
                $maxValue = $qValue
            }
        }
        
        return $maxValue
    }
    
    # Get all Q-values for a state (for analysis)
    [hashtable] GetQValues([string]$state) {
        $values = @{}
        
        foreach ($action in $this.Actions) {
            $values[$action] = $this.GetQValue($state, $action)
        }
        
        return $values
    }
    
    # Choose action using epsilon-greedy
    [string] ChooseAction([string]$state) {
        # Exploration: random action
        if ((Get-Random -Minimum 0.0 -Maximum 1.0) -lt $this.Epsilon) {
            $this.ExplorationCount++
            $randomIndex = Get-Random -Minimum 0 -Maximum $this.Actions.Count
            return $this.Actions[$randomIndex]
        }
        
        # Exploitation: best known action
        $this.ExploitationCount++
        return $this.GetBestAction($state)
    }
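
    # Example: with Epsilon = 0.25, roughly one call in four returns a uniformly
    # random action; the rest return GetBestAction($state).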
    
    # Learn from experience (Q-Learning update)
    [void] Learn([string]$state, [string]$action, [double]$reward, [string]$nextState) {
        # Current Q-value
        $currentQ = $this.GetQValue($state, $action)
        
        # Max Q-value for next state
        $maxNextQ = $this.GetMaxQValue($nextState)
        
        # Q-Learning update rule:
        # Q(s,a) ← Q(s,a) + α[r + γ·max(Q(s',a')) - Q(s,a)]
        $tdTarget = $reward + ($this.DiscountFactor * $maxNextQ)
        $tdError = $tdTarget - $currentQ
        $newQ = $currentQ + ($this.LearningRate * $tdError)
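
        # Worked example (illustrative values): currentQ = 0.0, reward = 2.0,
        # maxNextQ = 1.0, gamma = 0.9, alpha = 0.1:
        #   tdTarget = 2.0 + 0.9 * 1.0 = 2.9
        #   tdError  = 2.9 - 0.0       = 2.9
        #   newQ     = 0.0 + 0.1 * 2.9 = 0.29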
        
        # Update Q-table
        $this.SetQValue($state, $action, $newQ)
        
        # Increment steps and accumulate reward
        $this.TotalSteps++
        $this.TotalReward += $reward
    }
    
    # End episode (decay epsilon)
    [void] EndEpisode([double]$episodeReward) {
        $this.TotalEpisodes++
        $this.Episode = $this.TotalEpisodes
        $this.EpisodeRewards.Add($episodeReward) | Out-Null
        
        # Decay epsilon
        $this.Epsilon *= $this.EpsilonDecay
        if ($this.Epsilon -lt $this.MinEpsilon) {
            $this.Epsilon = $this.MinEpsilon
        }
    }
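
    # With the default decay of 0.995, epsilon falls to roughly 0.61 after 100
    # episodes and 0.08 after 500, reaching MinEpsilon (0.01) after about 920.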
    
    # Decay epsilon manually (reduce exploration over time)
    [void] DecayEpsilon([double]$decayRate) {
        $this.Epsilon *= $decayRate
        
        if ($this.Epsilon -lt $this.MinEpsilon) {
            $this.Epsilon = $this.MinEpsilon
        }
    }
    
    # Reset Q-table and all learning statistics
    [void] Reset() {
        $this.QTable.Clear()
        $this.TotalSteps = 0
        $this.TotalEpisodes = 0
        $this.Episode = 0
        $this.TotalReward = 0.0
        $this.EpisodeRewards.Clear()
        $this.ExplorationCount = 0
        $this.ExploitationCount = 0
        $this.Epsilon = 1.0
    }
    
    # Get statistics
    [hashtable] GetStats() {
        $avgReward = if ($this.TotalEpisodes -gt 0) {
            $this.TotalReward / $this.TotalEpisodes
        } else {
            0.0
        }
        
        # Recent average (last 10 episodes)
        $recentAvg = 0.0
        if ($this.EpisodeRewards.Count -gt 0) {
            $recentCount = [Math]::Min(10, $this.EpisodeRewards.Count)
            $start = $this.EpisodeRewards.Count - $recentCount
            $recentSum = 0.0
            for ($i = $start; $i -lt $this.EpisodeRewards.Count; $i++) {
                $recentSum += [double]$this.EpisodeRewards[$i]
            }
            $recentAvg = $recentSum / $recentCount
        }
        
        # Fraction of action choices that were exploratory (random)
        $totalChoices = $this.ExplorationCount + $this.ExploitationCount
        $explorationRatio = if ($totalChoices -gt 0) {
            $this.ExplorationCount / $totalChoices
        } else {
            1.0
        }
        
        return @{
            TotalSteps = $this.TotalSteps
            TotalEpisodes = $this.TotalEpisodes
            Episode = $this.TotalEpisodes
            TotalReward = $this.TotalReward
            QTableSize = $this.QTable.Count
            Epsilon = $this.Epsilon
            LearningRate = $this.LearningRate
            AverageReward = $avgReward
            RecentAverageReward = $recentAvg
            ExplorationRatio = $explorationRatio
            ExplorationCount = $this.ExplorationCount
            ExploitationCount = $this.ExploitationCount
            MemorySize = $this.MemorySize
        }
    }
    
    # Get statistics (alias for compatibility)
    [hashtable] GetStatistics() {
        return $this.GetStats()
    }
}
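
<#
Usage sketch (kept in a comment so nothing runs on dot-source): a minimal
training loop against a toy environment. The action names and reward values
below are illustrative assumptions, not part of the VBAF module.

    $agent = [QLearningAgent]::new(@('Sand', 'Stone', 'Ice'))

    for ($episode = 1; $episode -le 100; $episode++) {
        $state = 'START'
        $episodeReward = 0.0

        for ($step = 1; $step -le 20; $step++) {
            $action = $agent.ChooseAction($state)

            # Toy reward: favour switching castle types
            $reward = if ($action -ne $state) { 2.0 } else { -1.0 }
            $nextState = $action

            $agent.Learn($state, $action, $reward, $nextState)
            $state = $nextState
            $episodeReward += $reward
        }

        $agent.EndEpisode($episodeReward)
    }

    $agent.GetStats() | Format-Table -AutoSize
#>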