From a62043e086eaf067cb90c8962cb5a9e709829d80 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Wed, 21 Feb 2024 11:21:43 +0100 Subject: [PATCH] Fetching - restock detecting and visual selector scraper - Fixes scraping of elements that are not visible --- .../res/stock-not-in-stock.js | 88 ++++++++++--- .../res/xpath_element_scraper.js | 116 +++++++++++------- 2 files changed, 138 insertions(+), 66 deletions(-) diff --git a/changedetectionio/content_fetchers/res/stock-not-in-stock.js b/changedetectionio/content_fetchers/res/stock-not-in-stock.js index 3f7cc600..c240b22c 100644 --- a/changedetectionio/content_fetchers/res/stock-not-in-stock.js +++ b/changedetectionio/content_fetchers/res/stock-not-in-stock.js @@ -10,7 +10,7 @@ function isItemInStock() { const outOfStockTexts = [ ' أخبرني عندما يتوفر', '0 in stock', - 'actuellement indisponible', + 'actuellement indisponible', 'agotado', 'article épuisé', 'artikel zurzeit vergriffen', @@ -45,9 +45,9 @@ function isItemInStock() { 'no tickets available', 'not available', 'not currently available', - 'not in stock', + 'not in stock', 'notify me when available', - 'notify when available', + 'notify when available', 'não estamos a aceitar encomendas', 'out of stock', 'out-of-stock', @@ -70,7 +70,9 @@ function isItemInStock() { '품절' ]; + const vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0); + function getElementBaseText(element) { // .textContent can include text from children which may give the wrong results // scan only immediate TEXT_NODEs, which will be a child of the element @@ -81,29 +83,69 @@ function isItemInStock() { return text.toLowerCase().trim(); } - const negateOutOfStockRegex = new RegExp('([0-9] in stock|add to cart)', 'ig'); + const negateOutOfStockRegex = new RegExp('^([0-9] in stock|add to cart|in stock)', 'ig'); // The out-of-stock or in-stock-text is generally always above-the-fold // and often below-the-fold is a list of related products that may or may not contain trigger text // so it's good to filter to just the 'above the fold' elements // and it should be atleast 100px from the top to ignore items in the toolbar, sometimes menu items like "Coming soon" exist - const elementsToScan = Array.from(document.getElementsByTagName('*')).filter(element => element.getBoundingClientRect().top + window.scrollY <= vh && element.getBoundingClientRect().top + window.scrollY >= 100); + + +// @todo - if it's SVG or IMG, go into image diff mode +// %ELEMENTS% replaced at injection time because different interfaces use it with different settings + + console.log("Scanning %ELEMENTS%"); + + function collectVisibleElements(parent, visibleElements) { + if (!parent) return; // Base case: if parent is null or undefined, return + + // Add the parent itself to the visible elements array if it's of the specified types + visibleElements.push(parent); + + // Iterate over the parent's children + const children = parent.children; + for (let i = 0; i < children.length; i++) { + const child = children[i]; + if ( + child.nodeType === Node.ELEMENT_NODE && + window.getComputedStyle(child).display !== 'none' && + window.getComputedStyle(child).visibility !== 'hidden' && + child.offsetWidth >= 0 && + child.offsetHeight >= 0 && + window.getComputedStyle(child).contentVisibility !== 'hidden' + ) { + // If the child is an element and is visible, recursively collect visible elements + collectVisibleElements(child, visibleElements); + } + } + } + + const elementsToScan = []; + collectVisibleElements(document.body, elementsToScan); var elementText = ""; // REGEXS THAT REALLY MEAN IT'S IN STOCK for (let i = elementsToScan.length - 1; i >= 0; i--) { const element = elementsToScan[i]; + + // outside the 'fold' or some weird text in the heading area + // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden + if (element.getBoundingClientRect().top + window.scrollY >= vh || element.getBoundingClientRect().top + window.scrollY <= 100) { + continue + } + elementText = ""; if (element.tagName.toLowerCase() === "input") { - elementText = element.value.toLowerCase(); + elementText = element.value.toLowerCase().trim(); } else { elementText = getElementBaseText(element); } if (elementText.length) { // try which ones could mean its in stock - if (negateOutOfStockRegex.test(elementText)) { + if (negateOutOfStockRegex.test(elementText) && !elementText.includes('(0 products)')) { + console.log(`Negating/overriding 'Out of Stock' back to "Possibly in stock" found "${elementText}"`) return 'Possibly in stock'; } } @@ -112,28 +154,34 @@ function isItemInStock() { // OTHER STUFF THAT COULD BE THAT IT'S OUT OF STOCK for (let i = elementsToScan.length - 1; i >= 0; i--) { const element = elementsToScan[i]; - if (element.offsetWidth > 0 || element.offsetHeight > 0 || element.getClientRects().length > 0) { - elementText = ""; - if (element.tagName.toLowerCase() === "input") { - elementText = element.value.toLowerCase(); - } else { - elementText = getElementBaseText(element); - } + // outside the 'fold' or some weird text in the heading area + // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden + if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) { + continue + } + elementText = ""; + if (element.tagName.toLowerCase() === "input") { + elementText = element.value.toLowerCase().trim(); + } else { + elementText = getElementBaseText(element); + } - if (elementText.length) { - // and these mean its out of stock - for (const outOfStockText of outOfStockTexts) { - if (elementText.includes(outOfStockText)) { - return outOfStockText; // item is out of stock - } + if (elementText.length) { + // and these mean its out of stock + for (const outOfStockText of outOfStockTexts) { + if (elementText.includes(outOfStockText)) { + console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}"`) + return outOfStockText; // item is out of stock } } } } + console.log(`Returning 'Possibly in stock' - cant' find any useful matching text`) return 'Possibly in stock'; // possibly in stock, cant decide otherwise. } // returns the element text that makes it think it's out of stock return isItemInStock().trim() + diff --git a/changedetectionio/content_fetchers/res/xpath_element_scraper.js b/changedetectionio/content_fetchers/res/xpath_element_scraper.js index efe593d0..326889ea 100644 --- a/changedetectionio/content_fetchers/res/xpath_element_scraper.js +++ b/changedetectionio/content_fetchers/res/xpath_element_scraper.js @@ -16,24 +16,23 @@ try { } - // Include the getXpath script directly, easier than fetching function getxpath(e) { - var n = e; - if (n && n.id) return '//*[@id="' + n.id + '"]'; - for (var o = []; n && Node.ELEMENT_NODE === n.nodeType;) { - for (var i = 0, r = !1, d = n.previousSibling; d;) d.nodeType !== Node.DOCUMENT_TYPE_NODE && d.nodeName === n.nodeName && i++, d = d.previousSibling; - for (d = n.nextSibling; d;) { - if (d.nodeName === n.nodeName) { - r = !0; - break - } - d = d.nextSibling + var n = e; + if (n && n.id) return '//*[@id="' + n.id + '"]'; + for (var o = []; n && Node.ELEMENT_NODE === n.nodeType;) { + for (var i = 0, r = !1, d = n.previousSibling; d;) d.nodeType !== Node.DOCUMENT_TYPE_NODE && d.nodeName === n.nodeName && i++, d = d.previousSibling; + for (d = n.nextSibling; d;) { + if (d.nodeName === n.nodeName) { + r = !0; + break } - o.push((n.prefix ? n.prefix + ":" : "") + n.localName + (i || r ? "[" + (i + 1) + "]" : "")), n = n.parentNode + d = d.nextSibling } - return o.length ? "/" + o.reverse().join("/") : "" + o.push((n.prefix ? n.prefix + ":" : "") + n.localName + (i || r ? "[" + (i + 1) + "]" : "")), n = n.parentNode } + return o.length ? "/" + o.reverse().join("/") : "" +} const findUpTag = (el) => { let r = el @@ -59,14 +58,14 @@ const findUpTag = (el) => { // Strategy 2: Keep going up until we hit an ID tag, imagine it's like #list-widget div h4 while (r.parentNode) { - if (depth == 5) { + if (depth === 5) { break; } if ('' !== r.id) { chained_css.unshift("#" + CSS.escape(r.id)); final_selector = chained_css.join(' > '); // Be sure theres only one, some sites have multiples of the same ID tag :-( - if (window.document.querySelectorAll(final_selector).length == 1) { + if (window.document.querySelectorAll(final_selector).length === 1) { return final_selector; } return null; @@ -82,30 +81,60 @@ const findUpTag = (el) => { // @todo - if it's SVG or IMG, go into image diff mode // %ELEMENTS% replaced at injection time because different interfaces use it with different settings -var elements = window.document.querySelectorAll("%ELEMENTS%"); + var size_pos = []; // after page fetch, inject this JS // build a map of all elements and their positions (maybe that only include text?) var bbox; -for (var i = 0; i < elements.length; i++) { - bbox = elements[i].getBoundingClientRect(); +console.log("Scanning %ELEMENTS%"); + +function collectVisibleElements(parent, visibleElements) { + if (!parent) return; // Base case: if parent is null or undefined, return + - // Exclude items that are not interactable or visible - if(elements[i].style.opacity === "0") { - continue + // Add the parent itself to the visible elements array if it's of the specified types + const tagName = parent.tagName.toLowerCase(); + if ("%ELEMENTS%".split(',').includes(tagName)) { + visibleElements.push(parent); } - if(elements[i].style.display === "none" || elements[i].style.pointerEvents === "none" ) { - continue + + // Iterate over the parent's children + const children = parent.children; + for (let i = 0; i < children.length; i++) { + const child = children[i]; + if ( + child.nodeType === Node.ELEMENT_NODE && + window.getComputedStyle(child).display !== 'none' && + window.getComputedStyle(child).visibility !== 'hidden' && + child.offsetWidth >= 0 && + child.offsetHeight >= 0 && + window.getComputedStyle(child).contentVisibility !== 'hidden' + ) { + // If the child is an element and is visible, recursively collect visible elements + collectVisibleElements(child, visibleElements); + } } +} + +// Create an array to hold the visible elements +const visibleElementsArray = []; + +// Call collectVisibleElements with the starting parent element +collectVisibleElements(document.body, visibleElementsArray); + + +visibleElementsArray.forEach(function (element) { + + bbox = element.getBoundingClientRect(); // Skip really small ones, and where width or height ==0 - if (bbox['width'] * bbox['height'] < 100) { - continue; + if (bbox['width'] * bbox['height'] < 10) { + return } // Don't include elements that are offset from canvas - if (bbox['top']+scroll_y < 0 || bbox['left'] < 0) { - continue; + if (bbox['top'] + scroll_y < 0 || bbox['left'] < 0) { + return } // @todo the getXpath kind of sucks, it doesnt know when there is for example just one ID sometimes @@ -114,46 +143,41 @@ for (var i = 0; i < elements.length; i++) { // 1st primitive - if it has class, try joining it all and select, if theres only one.. well thats us. xpath_result = false; - try { - var d = findUpTag(elements[i]); + var d = findUpTag(element); if (d) { xpath_result = d; } } catch (e) { console.log(e); } - // You could swap it and default to getXpath and then try the smarter one // default back to the less intelligent one if (!xpath_result) { try { // I've seen on FB and eBay that this doesnt work // ReferenceError: getXPath is not defined at eval (eval at evaluate (:152:29), :67:20) at UtilityScript.evaluate (:159:18) at UtilityScript. (:1:44) - xpath_result = getxpath(elements[i]); + xpath_result = getxpath(element); } catch (e) { console.log(e); - continue; + return } } - if (window.getComputedStyle(elements[i]).visibility === "hidden") { - continue; - } - // @todo Possible to ONLY list where it's clickable to save JSON xfer size size_pos.push({ xpath: xpath_result, width: Math.round(bbox['width']), height: Math.round(bbox['height']), left: Math.floor(bbox['left']), - top: Math.floor(bbox['top'])+scroll_y, - tagName: (elements[i].tagName) ? elements[i].tagName.toLowerCase() : '', - tagtype: (elements[i].tagName == 'INPUT' && elements[i].type) ? elements[i].type.toLowerCase() : '', - isClickable: (elements[i].onclick) || window.getComputedStyle(elements[i]).cursor == "pointer" + top: Math.floor(bbox['top']) + scroll_y, + tagName: (element.tagName) ? element.tagName.toLowerCase() : '', + tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '', + isClickable: window.getComputedStyle(element).cursor == "pointer" }); -} +}); + // Inject the current one set in the include_filters, which may be a CSS rule // used for displaying the current one in VisualSelector, where its not one we generated. @@ -180,7 +204,7 @@ if (include_filters.length) { } } catch (e) { // Maybe catch DOMException and alert? - console.log("xpath_element_scraper: Exception selecting element from filter "+f); + console.log("xpath_element_scraper: Exception selecting element from filter " + f); console.log(e); } @@ -210,8 +234,8 @@ if (include_filters.length) { } } } - - if(!q) { + + if (!q) { console.log("xpath_element_scraper: filter element " + f + " was not found"); } @@ -221,7 +245,7 @@ if (include_filters.length) { width: parseInt(bbox['width']), height: parseInt(bbox['height']), left: parseInt(bbox['left']), - top: parseInt(bbox['top'])+scroll_y + top: parseInt(bbox['top']) + scroll_y }); } } @@ -229,7 +253,7 @@ if (include_filters.length) { // Sort the elements so we find the smallest one first, in other words, we find the smallest one matching in that area // so that we dont select the wrapping element by mistake and be unable to select what we want -size_pos.sort((a, b) => (a.width*a.height > b.width*b.height) ? 1 : -1) +size_pos.sort((a, b) => (a.width * a.height > b.width * b.height) ? 1 : -1) // Window.width required for proper scaling in the frontend return {'size_pos': size_pos, 'browser_width': window.innerWidth};