Functions/GenXdev.Webbrowser/Get-WebbrowserTabDomNodes.ps1

<##############################################################################
Part of PowerShell module : GenXdev.Webbrowser
Original cmdlet filename : Get-WebbrowserTabDomNodes.ps1
Original author : René Vaessen / GenXdev
Version : 1.270.2025
################################################################################
MIT License
 
Copyright 2021-2025 GenXdev
 
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
 
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
################################################################################>

###############################################################################
<#
.SYNOPSIS
Queries and manipulates DOM nodes in the active browser tab using CSS selectors.
 
.DESCRIPTION
Uses browser automation to find elements matching a CSS selector and returns their
HTML content or executes custom JavaScript on each matched element. This function
is useful for web scraping and browser automation tasks.
 
.PARAMETER QuerySelector
CSS selector string to find matching DOM elements. Uses standard CSS selector
syntax like '#id', '.class', 'tag', etc.
 
.PARAMETER ModifyScript
JavaScript code to execute on each matched element. The code runs as an async
function with parameters:
- e: The matched DOM element
- i: Index of the element (0-based)
- n: Complete NodeList of matching elements
- modifyScript: The script being executed
 
.EXAMPLE
Get HTML of all header divs
Get-WebbrowserTabDomNodes -QuerySelector "div.header"
 
.EXAMPLE
Pause all videos on the page
wl "video" "e.pause()"
#>

function Get-WebbrowserTabDomNodes {

    [CmdletBinding()]
    [System.Diagnostics.CodeAnalysis.SuppressMessageAttribute('PSUseSingularNouns', '')]
    [Alias('wl')]
    param(
        #######################################################################
        [parameter(
            Mandatory = $true,
            Position = 0,
            HelpMessage = 'The query selector string or array of strings to use for selecting DOM nodes'
        )]
        [string[]] $QuerySelector,
        #######################################################################
        [parameter(
            Mandatory = $false,
            Position = 1,
            ValueFromRemainingArguments = $false,
            HelpMessage = "The script to modify the output of the query selector, e.g. e.outerHTML or e.outerHTML='hello world'"
        )]
        [string] $ModifyScript = '',
        #######################################################################
        [Alias('e')]
        [parameter(
            Mandatory = $false,
            HelpMessage = 'Use Microsoft Edge browser'
        )]
        [switch] $Edge,
        ###############################################################################
        [Alias('ch')]
        [parameter(
            Mandatory = $false,
            HelpMessage = 'Use Google Chrome browser'
        )]
        [switch] $Chrome,
        ###############################################################################
        [Parameter(
            HelpMessage = 'Browser page object reference',
            ValueFromPipeline = $false
        )]
        [object] $Page,
        ###############################################################################
        [Parameter(
            HelpMessage = 'Browser session reference object',
            ValueFromPipeline = $false
        )]
        [PSCustomObject] $ByReference,
        ###############################################################################
        [Parameter(
            Mandatory = $false,
            ValueFromPipeline = $false,
            HelpMessage = 'Prevent automatic tab selection'
        )]
        [switch] $NoAutoSelectTab
        ###############################################################################
    )

    begin {
        # convert input parameters to json to prevent script injection attacks
        $jsonModifyScript = $ModifyScript |
            Microsoft.PowerShell.Utility\ConvertTo-Json -Compress -Depth 100 |
            Microsoft.PowerShell.Utility\ConvertTo-Json -Compress

        $jsonQuerySelector = @($QuerySelector) |
            Microsoft.PowerShell.Utility\ConvertTo-Json -Compress -Depth 100 |
            Microsoft.PowerShell.Utility\ConvertTo-Json -Compress

        # javascript that will be executed in the browser context
        # it handles both simple HTML extraction and custom modifications
        $browserScript = @"
        debugger;
let modifyScript = JSON.parse($jsonModifyScript);
let selectors = JSON.parse($jsonQuerySelector);
selectors = selectors instanceof Array ? selectors : [selectors];
let currentSelector = selectors[0];
async function* traverseNodes(node, selectorIndex) {
    if (selectorIndex >= selectors.length) return;
 
    let currentSelector = selectors[selectorIndex];
    let nodes = node.querySelectorAll(currentSelector);
 
    for (let i = 0; i < nodes.length; i++) {
        let currentNode = nodes[i];
 
        // Check for Shadow DOM
        if (currentNode.shadowRoot) {
            yield* traverseNodes(currentNode.shadowRoot, selectorIndex + 1);
            continue;
        }
 
        // Check for IFrames
        if (currentNode.tagName === 'IFRAME') {
            try {
                let iframeDoc = currentNode.contentDocument || currentNode.contentWindow.document;
                yield* traverseNodes(iframeDoc, selectorIndex + 1);
            } catch(e) {
                // Handle cross-origin iframe access errors
                console.warn('Cannot access iframe content');
            }
            continue;
        }
 
        // If this is the last selector, process the node
        if (selectorIndex === selectors.length - 1) {
            if (!!modifyScript && modifyScript != "") {
                try {
                    yield await (async function(e, i, n, modifyScript) {
                        return eval(modifyScript);
                    })(currentNode, i, nodes, modifyScript);
                } catch (e) {
                    yield e+'';
                }
            } else {
                yield currentNode.outerHTML;
            }
        } else {
            // Continue traversing with next selector
            yield* traverseNodes(currentNode, selectorIndex + 1);
        }
    }
}
 
// Start traversal from document root
for await (let result of traverseNodes(document, 0)) {
    yield result;
}
"@

    }


    process {

        # log the operation for debugging purposes
        Microsoft.PowerShell.Utility\Write-Verbose "Executing query '$QuerySelector' with modifier script:`n$ModifyScript"

        # execute the javascript in browser and return results
        $invocationParams = GenXdev.Helpers\Copy-IdenticalParamValues `
            -BoundParameters $PSBoundParameters `
            -FunctionName 'GenXdev.Webbrowser\Invoke-WebbrowserEvaluation'

        $invocationParams.Scripts = $browserScript
        GenXdev.Webbrowser\Invoke-WebbrowserEvaluation @invocationParams
    }

    end {
    }
}