// Copyright (C) 2021 Leigh Morresi (dgtlmoon@gmail.com)
// All rights reserved.
// @file Scrape the page looking for elements of concern (%ELEMENTS%)
// http://matatk.agrip.org.uk/tests/position-and-width/
// https://stackoverflow.com/questions/26813480/when-is-element-getboundingclientrect-guaranteed-to-be-updated-accurate
//
// Some pages, like https://www.londonstockexchange.com/stock/NCCL/ncondezi-energy-limited/analysis,
// automatically force a scroll somewhere, so include the scroll position offset.
// Let's hope the position doesn't change while we iterate over the bounding boxes, but this is better than nothing.
// Vertical scroll offset of the page at scan time, as a whole number of pixels.
// Prefers documentElement.scrollTop and falls back to body.scrollTop when that is 0 or unusable.
// Stays 0 when the offset cannot be read (the error is logged, not rethrown).
var scroll_y = 0;
try {
    // Coerce both candidates: the original unary plus only applied to the first operand
    const raw_scroll_y = Number(document.documentElement.scrollTop) || Number(document.body.scrollTop);
    // scrollTop can be fractional or missing; keep scroll_y a finite integer so that the
    // coordinates derived from it stay integers as well
    scroll_y = Number.isFinite(raw_scroll_y) ? Math.round(raw_scroll_y) : 0;
} catch (e) {
    console.log(e);
}
// Include the getXpath script directly, easier than fetching
/**
 * Build an XPath expression that locates the given node.
 *
 * A node with a truthy `id` is addressed by an id lookup (`//*[@id=...]`). Otherwise a path of
 * element steps is built by walking up `parentNode` until a non-element node is reached; a
 * 1-based position predicate is added to a step whenever a sibling shares its node name.
 *
 * @param {Node} e - Node to describe.
 * @returns {string} The XPath expression, or "" when `e` is missing or is not an element.
 */
function getxpath(e) {
    // XPath 1.0 string literals have no escape sequences, so the quote style is chosen by
    // content, with concat() as the fallback when the value contains both kinds of quote.
    const toXPathLiteral = (value) => {
        if (!value.includes('"')) {
            return '"' + value + '"';
        }
        if (!value.includes("'")) {
            return "'" + value + "'";
        }
        return 'concat("' + value.split('"').join('", \'"\', "') + '")';
    };

    let node = e;
    if (node && node.id) {
        return '//*[@id=' + toXPathLiteral(String(node.id)) + ']';
    }

    const steps = [];
    while (node && node.nodeType === Node.ELEMENT_NODE) {
        // Count preceding siblings with the same node name (the doctype node is ignored)
        let precedingSameName = 0;
        for (let prev = node.previousSibling; prev; prev = prev.previousSibling) {
            if (prev.nodeType !== Node.DOCUMENT_TYPE_NODE && prev.nodeName === node.nodeName) {
                precedingSameName++;
            }
        }

        // A following sibling with the same name also forces an explicit index on this step
        let hasFollowingSameName = false;
        for (let next = node.nextSibling; next; next = next.nextSibling) {
            if (next.nodeName === node.nodeName) {
                hasFollowingSameName = true;
                break;
            }
        }

        const prefix = node.prefix ? node.prefix + ":" : "";
        const position = (precedingSameName || hasFollowingSameName) ? "[" + (precedingSameName + 1) + "]" : "";
        steps.push(prefix + node.localName + position);
        node = node.parentNode;
    }

    // Steps were collected leaf-first, so reverse them into root-first order
    return steps.length ? "/" + steps.reverse().join("/") : "";
}
/**
 * Propose a short selector for an element, or null when none of the strategies applies.
 *
 * Strategy 1: an element with a non-empty string `name` is addressed as TAG[name="..."]; when
 * several elements match, a Playwright `>> nth=` suffix selects the right one by index.
 * Strategy 2: walk up at most 5 levels, collecting tag names until an element with an id is
 * reached (e.g. "#list-widget > div > h4"). The result is only used when it matches exactly
 * one element in the document.
 *
 * @param {Element} el - Element to build a selector for.
 * @returns {string|null} The selector, or null when no suitable one was found.
 */
const findUpTag = (el) => {
    // Strategy 1: If it's an input, with name, and there's only one, prefer that
    if (typeof el.name === 'string' && el.name.length) {
        // Quote and escape the value so names such as "user[email]" still form a valid selector
        const proposed = el.tagName + '[name="' + CSS.escape(el.name) + '"]';
        const proposed_element = window.document.querySelectorAll(proposed);
        if (proposed_element.length === 1) {
            return proposed;
        }
        if (proposed_element.length > 1) {
            // Some sites change ID but name= stays the same, we can hit it if we know the index
            const n = Array.from(proposed_element).indexOf(el);
            // -1 would yield "nth=-1", which does not identify this element; fall through instead
            if (n !== -1) {
                // Return a Playwright selector such as input[name="zipcode"] >> nth=1
                return proposed + " >> nth=" + n;
            }
        }
    }

    // Strategy 2: Keep going up until we hit an ID tag, imagine it's like #list-widget div h4
    const chained_css = [];
    let r = el;
    for (let depth = 0; r.parentNode && depth < 5; depth += 1) {
        if ('' !== r.id) {
            chained_css.unshift("#" + CSS.escape(r.id));
            const final_selector = chained_css.join(' > ');
            // Be sure there's only one, some sites have multiples of the same ID tag
            if (window.document.querySelectorAll(final_selector).length === 1) {
                return final_selector;
            }
            return null;
        }
        chained_css.unshift(r.tagName.toLowerCase());
        r = r.parentNode;
    }
    return null;
}
// @todo - if it's SVG or IMG, go into image diff mode
// %ELEMENTS% replaced at injection time because different interfaces use it with different settings
// This script is injected once the page has been fetched.
// Bounding-box variable, declared up front without an initial value.
var bbox;
// Result list: a map of the elements and their positions (maybe it should only include ones with text?)
var size_pos = [];
console.log("Scanning %ELEMENTS%");
/**
 * Recursively collect `parent` and its visible descendants whose lower-cased tag name is in
 * the comma-separated tag list that is injected into this function at injection time.
 *
 * The `parent` passed in is not itself tested for visibility; children are tested before being
 * descended into, so a child that fails the test prunes its whole subtree.
 *
 * @param {Element} parent - Root to start from; a null/undefined root is a no-op.
 * @param {Element[]} visibleElements - Output array; matching elements are appended to it.
 */
function collectVisibleElements(parent, visibleElements) {
    if (!parent) return; // Nothing to scan

    // Split the injected tag list once per top-level call instead of once per visited node
    const wantedTags = new Set("%ELEMENTS%".split(','));

    // A child is descended into only when every one of these checks passes
    const isVisible = (child) => {
        if (child.nodeType !== Node.ELEMENT_NODE) return false;
        // `>= 0` is also false for nodes that do not expose offsetWidth/offsetHeight at all
        if (!(child.offsetWidth >= 0 && child.offsetHeight >= 0)) return false;
        // One computed-style lookup per child, shared by the three property checks
        const style = window.getComputedStyle(child);
        return style.display !== 'none' &&
            style.visibility !== 'hidden' &&
            style.contentVisibility !== 'hidden';
    };

    const visit = (node) => {
        // Add the node itself if it's one of the requested tag types
        if (wantedTags.has(node.tagName.toLowerCase())) {
            visibleElements.push(node);
        }
        for (const child of node.children) {
            if (isVisible(child)) {
                visit(child);
            }
        }
    };

    visit(parent);
}
// Collect every candidate element under <body>, then record a selector, the geometry and a few
// style hints for each one that is large enough and not positioned off-canvas.
const visibleElementsArray = [];
collectVisibleElements(document.body, visibleElementsArray);

visibleElementsArray.forEach(function (element) {
    // Local to this callback so nothing is shared between elements
    const bbox = element.getBoundingClientRect();

    // Skip really small ones, and where width or height == 0
    if (bbox['width'] * bbox['height'] < 10) {
        return;
    }

    // Don't include elements that are offset from canvas
    if (bbox['top'] + scroll_y < 0 || bbox['left'] < 0) {
        return;
    }

    // @todo getxpath is not very smart: it doesn't know when it could anchor off a single ID one
    // level up instead of traversing. Idea: take the current class or id and keep traversing up,
    // looking only at class/id, until there is exactly one match.
    // Try the smarter selector builder first; a failure here is logged and is not fatal.
    let xpath_result = false;
    try {
        xpath_result = findUpTag(element) || false;
    } catch (e) {
        console.log(e);
    }

    // Default back to the less intelligent one (the order could be swapped)
    if (!xpath_result) {
        try {
            // This has been seen to fail on FB and eBay with
            // "ReferenceError: getXPath is not defined"; the element is skipped in that case
            xpath_result = getxpath(element);
        } catch (e) {
            console.log(e);
            return;
        }
    }

    const label = "not-interesting"; // A placeholder, the actual labels for training are done by hand for now

    // First 30 characters of the text with runs of newlines/tabs collapsed to one. The global
    // replaces collapse every run in a single pass, so no loop is needed.
    const text = element.textContent.trim().slice(0, 30).trim()
        .replace(/\n{2,}/g, '\n')
        .replace(/\t{2,}/g, '\t');

    // Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
    const hasDigitNearEdge = /\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6));
    const hasDigitCurrency = hasDigitNearEdge && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,– )/.test(text);

    // One computed-style lookup shared by all style-derived fields
    const style = window.getComputedStyle(element);
    const tagName = (element.tagName) ? element.tagName.toLowerCase() : '';

    size_pos.push({
        xpath: xpath_result,
        width: Math.round(bbox['width']),
        height: Math.round(bbox['height']),
        left: Math.floor(bbox['left']),
        top: Math.floor(bbox['top']) + scroll_y,
        // tagName used by Browser Steps
        tagName: tagName,
        // tagtype used by Browser Steps
        tagtype: (tagName === 'input' && element.type) ? element.type.toLowerCase() : '',
        isClickable: style.cursor === "pointer",
        // Used by the keras trainer
        fontSize: style.getPropertyValue('font-size'),
        fontWeight: style.getPropertyValue('font-weight'),
        hasDigitCurrency: hasDigitCurrency,
        label: label,
    });
});
// Also report the elements matched by the filters currently set in include_filters (which may
// be CSS or XPath rules this script did not generate) so the Visual Selector can display them.
if (include_filters.length) {
    // For each filter, find it on the page and add it to the results so it can be visualised again
    for (const f of include_filters) {
        if (!f.length) {
            console.log("xpath_element_scraper: Empty filter, skipping");
            continue;
        }

        // Reset for every filter: a filter that fails to evaluate must not re-report the
        // matches of the previous one.
        let results = null;
        try {
            // is it xpath?
            if (f.startsWith('/') || f.startsWith('xpath')) {
                // Strip an "xpath:" / "xpath<digit>:" prefix before evaluating
                const qry_f = f.replace(/xpath(:|\d:)/, '');
                console.log("[xpath] Scanning for included filter " + qry_f);
                const xpathResult = document.evaluate(qry_f, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
                results = [];
                for (let i = 0; i < xpathResult.snapshotLength; i++) {
                    results.push(xpathResult.snapshotItem(i));
                }
            } else {
                console.log("[css] Scanning for included filter " + f);
                results = document.querySelectorAll(f);
            }
        } catch (e) {
            // Maybe catch DOMException and alert?
            console.log("xpath_element_scraper: Exception selecting element from filter " + f);
            console.log(e);
        }

        if (results == null || !results.length) {
            continue;
        }

        results.forEach((node) => {
            // Per-node state, so a node without a usable box never inherits the previous node's box
            let bbox = false;
            let target = node;

            // Try to resolve //something/text() back to its /something so we can at least get the bounding box
            try {
                if (target.nodeName === '#text') {
                    target = target.parentElement;
                }
            } catch (e) {
                console.log(e);
                console.log("xpath_element_scraper: #text resolver");
            }

            // #1231 - In the case an XPath attribute filter is applied, we have to traverse up and find the element.
            if (target && typeof target.getBoundingClientRect == 'function') {
                bbox = target.getBoundingClientRect();
                console.log("xpath_element_scraper: Got filter element, scroll from top was " + scroll_y);
            } else {
                try {
                    // Try and see if we can find its ownerElement (also covers a missing target)
                    bbox = target.ownerElement.getBoundingClientRect();
                    console.log("xpath_element_scraper: Got filter by ownerElement element, scroll from top was " + scroll_y);
                } catch (e) {
                    console.log(e);
                    console.log("xpath_element_scraper: error looking up q.ownerElement");
                }
            }

            if (bbox && bbox['width'] > 0 && bbox['height'] > 0) {
                // Math.trunc keeps the truncation of the former parseInt() calls without going
                // through a string (parseInt misreads exponent notation such as "1e-7")
                size_pos.push({
                    xpath: f,
                    width: Math.trunc(bbox['width']),
                    height: Math.trunc(bbox['height']),
                    left: Math.trunc(bbox['left']),
                    top: Math.trunc(bbox['top']) + scroll_y,
                    highlight_as_custom_filter: true
                });
            }
        });
    }
}
// Sort the elements so we find the smallest one first, in other words, we find the smallest one matching in that area
// so that we dont select the wrapping element by mistake and be unable to select what we want
size _pos . sort ( ( a , b ) => ( a . width * a . height > b . width * b . height ) ? 1 : - 1 )
// Window.width required for proper scaling in the frontend
return { 'size_pos' : size _pos , 'browser_width' : window . innerWidth } ;