VBAF.RL.Example-CastleLearning.ps1
|
#Requires -Version 5.1
<#
.SYNOPSIS
    Q-Learning Castle Agent -- Training Demo

.DESCRIPTION
    Demonstrates a Q-Learning agent learning to generate castle sequences.

    WHAT YOU ARE LEARNING HERE:
    ============================
    This example shows Q-Learning applied to a creative problem --
    generating sequences of castle types that are visually varied and
    engaging. Unlike XOR (which has one correct answer), this is an
    OPTIMISATION problem -- there is no single right sequence, but some
    sequences are better than others according to a reward function.

    THE ENVIRONMENT:
    ================
    State:   the last 1-2 castle types chosen (recent history)
    Actions: choose the next castle type from 8 options
    Reward:  +2 for variety, -1 for repetition, plus visual balance and
             engagement scores (simulated here with random values)

    WHAT THE AGENT LEARNS:
    =======================
    Over 100 episodes the agent discovers that:
      - Repeating the same castle type is penalised
      - Mixing different types earns higher rewards
      - Some transitions (e.g. Gothic -> Fairy Tale) score better than
        others on average

    THE Q-TABLE GROWS AS THE AGENT EXPLORES:
    =========================================
    Episode 1:   Q-table has ~0 entries (nothing visited yet)
    Episode 10:  Q-table growing -- common transitions recorded
    Episode 100: Q-table stable -- agent exploiting learned values

    Watch the Q-table size grow during training.
    Watch epsilon decay from 1.0 (random) toward 0.01 (learned).
    Watch recent average reward increase as the agent improves.

    EXPLORATION vs EXPLOITATION IN PRACTICE:
    =========================================
    Early episodes:  epsilon ~1.0  -- agent tries everything randomly
    Middle episodes: epsilon ~0.5  -- mix of random and learned choices
    Late episodes:   epsilon ~0.01 -- agent mostly uses learned Q-values

    This gradual shift is called the epsilon schedule.
    Too fast: agent stops exploring before finding good strategies.
    Too slow: agent wastes time exploring when it already knows what works.

    REWARD DESIGN NOTE:
    ===================
    In this example, visual balance and engagement are SIMULATED with
    random values. In a real application, these would come from user
    feedback, aesthetic scoring algorithms, or A/B test results.
    The random simulation still teaches the variety reward correctly.

.NOTES
    Part of VBAF (Visual AI & Reinforcement Learning Framework)
    Educational use -- compare output with VBAF.RL.DQN.ps1 to see how
    neural networks handle larger state spaces.

    Requires: VBAF.RL.QTable.ps1, VBAF.RL.ExperienceReplay.ps1,
              VBAF.RL.QLearningAgent.ps1
#>

# LOAD DEPENDENCIES
# $PSScriptRoot is empty when this code is pasted into a console or run as an
# editor selection; fall back to the current location so Join-Path never
# receives an empty path.
$basePath = if ($PSScriptRoot) { $PSScriptRoot } else { (Get-Location).Path }

# Order matters: the agent script is loaded last, after the two scripts it is
# listed with in .NOTES.
$requiredScripts = @(
    "VBAF.RL.QTable.ps1",
    "VBAF.RL.ExperienceReplay.ps1",
    "VBAF.RL.QLearningAgent.ps1"
)

foreach ($scriptName in $requiredScripts) {
    $scriptPath = Join-Path $basePath $scriptName

    # Fail fast with one clear message instead of letting a missing file
    # surface later as an unrelated "cannot find type" error.
    if (-not (Test-Path -LiteralPath $scriptPath -PathType Leaf)) {
        throw "Required script not found: $scriptPath"
    }

    # foreach does not open a new scope, so dot-sourcing here still defines
    # the loaded types in this script's scope.
    . $scriptPath
}

Write-Host ""
Write-Host "+----------------------------------------------+" -ForegroundColor Cyan
Write-Host "| Q-LEARNING CASTLE AGENT - TRAINING DEMO |" -ForegroundColor Cyan
Write-Host "+----------------------------------------------+" -ForegroundColor Cyan

# THE ACTION SPACE
# These are the castle types the agent can choose from.
# Each is a discrete action -- the agent picks one per step.
# With only 8 actions the problem is small enough for a Q-table
# (no neural network needed).
# NOTE(review): an earlier comment stated "8 actions x 8 possible states = 64
# Q-table entries at most". The real upper bound depends on how GetState
# encodes the recent history (defined in VBAF.RL.QLearningAgent.ps1) -- confirm.
$castleTypes = @(
    "Gothic",
    "FairyTale",
    "Fortress",
    "Palace",
    "Wizard",
    "Cathedral",
    "Oriental",
    "Ruins"
)

Write-Host ""
Write-Host "Available Castle Types (the action space):" -ForegroundColor Yellow
foreach ($type in $castleTypes) {
    Write-Host " - $type"
}

# CREATE THE AGENT
# According to the original notes the default constructor uses:
#   alpha (learning rate) = 0.1
#   gamma (discount)      = 0.9
#   epsilon               = 1.0 (start fully random)
# The actual values are printed below straight from the agent.
Write-Host ""
Write-Host "Creating Q-Learning Agent..." -ForegroundColor Yellow

# @(,$castleTypes) wraps the array so it is passed as ONE constructor argument
# rather than being splatted into eight separate arguments.
$agent = New-Object QLearningAgent -ArgumentList @(,$castleTypes)

Write-Host " Alpha (learning rate) : $($agent.Alpha) -- how fast Q-values update"
Write-Host " Gamma (discount) : $($agent.Gamma) -- how much future rewards matter"
Write-Host " Epsilon (exploration) : $($agent.Epsilon) -- start 100% random"

# TRAINING CONFIGURATION
# 100 episodes x 10 steps = 1000 total (state, action, reward) interactions.
# Each interaction potentially updates one Q-table entry.
$episodes        = 100
$stepsPerEpisode = 10

# How many recent castle choices are kept as context for the state.
$maxHistory      = 5

Write-Host ""
Write-Host "Training Configuration:" -ForegroundColor Yellow
Write-Host " Episodes : $episodes"
Write-Host " Steps per episode : $stepsPerEpisode"
Write-Host " Total interactions : $($episodes * $stepsPerEpisode)"

# recentCastles tracks the last few castle types chosen.
# This becomes the STATE that the agent observes.
# NOTE(review): the history is created once and is NOT cleared between
# episodes, so each episode starts from the tail of the previous one.
# Presumably intentional (one continuing sequence) -- confirm.
$recentCastles = New-Object System.Collections.ArrayList

Write-Host ""
Write-Host ("-" * 60) -ForegroundColor Cyan
Write-Host "TRAINING IN PROGRESS" -ForegroundColor Cyan
Write-Host ("-" * 60) -ForegroundColor Cyan
Write-Host ""

# Loop-invariant progress line layout, hoisted out of the training loop.
$progressFormat = "Episode {0,3} | Reward: {1,6:F2} | Epsilon: {2:F3} | Exploit: {3,5:F1}% | Q-Table: {4,3} entries"

# MAIN TRAINING LOOP
for ($ep = 1; $ep -le $episodes; $ep++) {
    $episodeReward = 0.0

    for ($step = 1; $step -le $stepsPerEpisode; $step++) {
        # OBSERVE: convert recent history into a state
        # (e.g. "Gothic|Fortress" = last two castle types).
        $context = @{ RecentTypes = $recentCastles }
        $state   = $agent.GetState($context)

        # ACT: epsilon-greedy -- random or best known action.
        $action = $agent.ChooseAction($state)

        # ENVIRONMENT RESPONSE:
        # IsVaried is true when there is no history yet or when this castle
        # differs from the previous one. VisualBalance and Engagement are
        # simulated with random values; in a real system these would come
        # from user ratings or scoring.
        $isVaried      = ($recentCastles.Count -eq 0) -or ($recentCastles[-1] -ne $action)
        $visualBalance = Get-Random -Minimum 0.0 -Maximum 1.0
        $engagement    = Get-Random -Minimum 0.0 -Maximum 1.0

        $outcome = @{
            CastleType    = $action
            IsVaried      = $isVaried
            VisualBalance = $visualBalance
            Engagement    = $engagement
        }

        # REWARD: shaped to encourage variety and quality.
        $reward = $agent.CalculateReward($outcome)
        $episodeReward += $reward

        # UPDATE STATE: append the chosen castle and drop the oldest entry
        # once the history window is exceeded. ArrayList.Add returns the new
        # index, which is discarded to keep it out of the output stream.
        $recentCastles.Add($action) | Out-Null
        if ($recentCastles.Count -gt $maxHistory) {
            $recentCastles.RemoveAt(0)
        }

        # OBSERVE NEXT STATE: what the agent sees after its action.
        $nextContext = @{ RecentTypes = $recentCastles }
        $nextState   = $agent.GetState($nextContext)

        # LEARN: update Q(state, action) from the observed transition.
        $agent.Learn($state, $action, $reward, $nextState)
    }

    # END EPISODE: hand the episode total to the agent (records reward and,
    # per the original notes, decays epsilon).
    $agent.EndEpisode($episodeReward)

    # Print progress for the first episode, every 10th, and the last one.
    if ($ep % 10 -eq 0 -or $ep -eq 1 -or $ep -eq $episodes) {
        $stats        = $agent.GetStats()
        $totalActions = $stats.ExplorationCount + $stats.ExploitationCount

        # Guard against division by zero before any action was counted.
        $exploitPct = if ($totalActions -gt 0) {
            ($stats.ExploitationCount / $totalActions) * 100
        } else {
            0.0
        }

        Write-Host ($progressFormat -f $ep, $episodeReward, $stats.Epsilon, $exploitPct, $stats.QTableSize)
    }
}

Write-Host ""
Write-Host " Training complete!" -ForegroundColor Green

# FINAL RESULTS
Write-Host ""
Write-Host ("-" * 60) -ForegroundColor Cyan
Write-Host "FINAL RESULTS" -ForegroundColor Cyan
Write-Host ("-" * 60) -ForegroundColor Cyan

$finalStats = $agent.GetStats()

Write-Host ""
Write-Host "Learning Progress:" -ForegroundColor Yellow
Write-Host " Total Episodes : $($finalStats.Episode)"
Write-Host " Total Reward : $($finalStats.TotalReward.ToString('F2'))"
Write-Host " Average Reward : $($finalStats.AverageReward.ToString('F2'))"
Write-Host " Recent Average (last 10): $($finalStats.RecentAverageReward.ToString('F2'))"

Write-Host ""
Write-Host "Exploration vs Exploitation:" -ForegroundColor Yellow
Write-Host " Explorations : $($finalStats.ExplorationCount) (random actions taken)"
Write-Host " Exploitations : $($finalStats.ExploitationCount) (learned actions taken)"
Write-Host " Final Epsilon : $($finalStats.Epsilon.ToString('F3')) (target: 0.010)"

Write-Host ""
Write-Host "Knowledge Base:" -ForegroundColor Yellow
Write-Host " Q-Table Entries : $($finalStats.QTableSize) (state-action pairs learned)"
Write-Host " Experiences Stored : $($finalStats.MemorySize)"

# INSPECT WHAT WAS LEARNED
# This is the unique advantage of Q-learning over DQN:
# you can READ the Q-table and understand exactly what the agent learned.
# A DQN stores knowledge in neural network weights -- much harder to inspect.
Write-Host ""
Write-Host "Learned Q-Values by State:" -ForegroundColor Yellow
Write-Host " (Positive = agent prefers this castle type in this state)" -ForegroundColor DarkGray
Write-Host " (Negative = agent avoids this castle type in this state)" -ForegroundColor DarkGray

# ArrayList (as used for $recentCastles) avoids re-allocating an array on
# every append, which is what "+=" on a PowerShell array does.
$statesFound = New-Object System.Collections.ArrayList

foreach ($stateKey in $agent.QTable.Keys) {
    $qValues = $agent.GetQValues($stateKey)

    # A state counts as "learned" once any of its Q-values is non-zero.
    $hasLearning = $false
    foreach ($val in $qValues.Values) {
        if ($val -ne 0) {
            $hasLearning = $true
            break
        }
    }

    if ($hasLearning) {
        $statesFound.Add($stateKey) | Out-Null

        Write-Host ""
        Write-Host " State '$stateKey':" -ForegroundColor Cyan

        # Highest Q-value first; entries still at zero are skipped.
        $sorted = $qValues.GetEnumerator() | Sort-Object Value -Descending
        foreach ($item in $sorted) {
            if ($item.Value -ne 0) {
                $color = if ($item.Value -gt 0) {
                    "Green"
                } elseif ($item.Value -lt 0) {
                    "Red"
                } else {
                    "Gray"
                }
                Write-Host (" {0,-15} {1,8:F4}" -f $item.Key, $item.Value) -ForegroundColor $color
            }
        }
    }
}

if ($statesFound.Count -eq 0) {
    Write-Host ""
    Write-Host " No learning detected -- Q-Learning update may not be working." -ForegroundColor Red
} else {
    Write-Host ""
    Write-Host " Learning detected in states: $($statesFound -join ', ')" -ForegroundColor Green
}

# IMPROVEMENT CHECK
# If recent average > overall average, the agent improved during training.
if ($finalStats.RecentAverageReward -gt $finalStats.AverageReward) {
    Write-Host ""
    Write-Host " Agent IMPROVED -- recent rewards higher than overall average!" -ForegroundColor Green
    Write-Host " Q-learning successfully shifted from exploration to exploitation." -ForegroundColor DarkGray
} else {
    Write-Host ""
    Write-Host " Agent performance stable -- try more episodes for further improvement." -ForegroundColor Yellow
}

Write-Host ""

# ============================================================================
# WHAT TO TRY NEXT:
# =================
# 1. Increase episodes to 500 -- watch Q-table grow and epsilon reach 0.01
# 2. Change stepsPerEpisode to 20 -- more interactions per episode
# 3. Print agent.QTable directly to see every learned value:
#        $agent.QTable | Format-Table
# 4. Compare with DQN on the same problem -- does neural network learn faster
# 5. Move on to: VBAF.Business.Test.CompanyMarket.ps1 -- multi-agent competition
# ============================================================================