From a62043e086eaf067cb90c8962cb5a9e709829d80 Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Wed, 21 Feb 2024 11:21:43 +0100
Subject: [PATCH] Fetching - restock detecting and visual selector scraper -
 Fixes scraping of elements that are not visible

---
 .../res/stock-not-in-stock.js                 |  88 ++++++++++---
 .../res/xpath_element_scraper.js              | 116 +++++++++++-------
 2 files changed, 138 insertions(+), 66 deletions(-)

diff --git a/changedetectionio/content_fetchers/res/stock-not-in-stock.js b/changedetectionio/content_fetchers/res/stock-not-in-stock.js
index 3f7cc600..c240b22c 100644
--- a/changedetectionio/content_fetchers/res/stock-not-in-stock.js
+++ b/changedetectionio/content_fetchers/res/stock-not-in-stock.js
@@ -10,7 +10,7 @@ function isItemInStock() {
     const outOfStockTexts = [
         ' أخبرني عندما يتوفر',
         '0 in stock',
-        'actuellement indisponible',        
+        'actuellement indisponible',
         'agotado',
         'article épuisé',
         'artikel zurzeit vergriffen',
@@ -45,9 +45,9 @@ function isItemInStock() {
         'no tickets available',
         'not available',
         'not currently available',
-        'not in stock',        
+        'not in stock',
         'notify me when available',
-        'notify when available',            
+        'notify when available',
         'não estamos a aceitar encomendas',
         'out of stock',
         'out-of-stock',
@@ -70,7 +70,9 @@ function isItemInStock() {
         '품절'
     ];
 
+
     const vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0);
+
     function getElementBaseText(element) {
         // .textContent can include text from children which may give the wrong results
         // scan only immediate TEXT_NODEs, which will be a child of the element
@@ -81,29 +83,69 @@ function isItemInStock() {
         return text.toLowerCase().trim();
     }
 
-    const negateOutOfStockRegex = new RegExp('([0-9] in stock|add to cart)', 'ig');
+    const negateOutOfStockRegex = new RegExp('^([0-9] in stock|add to cart|in stock)', 'i'); // no 'g' flag: a global regex keeps lastIndex between .test() calls and, with '^', would miss the element tested right after a vetoed match
 
     // The out-of-stock or in-stock-text is generally always above-the-fold
     // and often below-the-fold is a list of related products that may or may not contain trigger text
     // so it's good to filter to just the 'above the fold' elements
     // and it should be atleast 100px from the top to ignore items in the toolbar, sometimes menu items like "Coming soon" exist
-    const elementsToScan = Array.from(document.getElementsByTagName('*')).filter(element => element.getBoundingClientRect().top + window.scrollY <= vh && element.getBoundingClientRect().top + window.scrollY >= 100);
+
+
+// @todo - if it's SVG or IMG, go into image diff mode
+// %ELEMENTS% replaced at injection time because different interfaces use it with different settings
+
+    console.log("Scanning %ELEMENTS%");
+
+    function collectVisibleElements(parent, visibleElements) {
+        if (!parent) return; // Base case: if parent is null or undefined, return
+
+        // Always collect the element itself - this script applies no tag-name filtering
+        visibleElements.push(parent);
+
+        // Iterate over the parent's children
+        const children = parent.children;
+        for (let i = 0; i < children.length; i++) {
+            const child = children[i];
+            if (
+                child.nodeType === Node.ELEMENT_NODE &&
+                window.getComputedStyle(child).display !== 'none' &&
+                window.getComputedStyle(child).visibility !== 'hidden' &&
+                child.offsetWidth >= 0 &&
+                child.offsetHeight >= 0 &&
+                window.getComputedStyle(child).contentVisibility !== 'hidden'
+            ) {
+                // If the child is an element and is visible, recursively collect visible elements
+                collectVisibleElements(child, visibleElements);
+            }
+        }
+    }
+
+    const elementsToScan = [];
+    collectVisibleElements(document.body, elementsToScan);
 
     var elementText = "";
 
     // REGEXS THAT REALLY MEAN IT'S IN STOCK
     for (let i = elementsToScan.length - 1; i >= 0; i--) {
         const element = elementsToScan[i];
+
+        // outside the 'fold' or some weird text in the heading area
+        // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
+        if (element.getBoundingClientRect().top + window.scrollY >= vh || element.getBoundingClientRect().top + window.scrollY <= 100) {
+            continue
+        }
+
         elementText = "";
         if (element.tagName.toLowerCase() === "input") {
-            elementText = element.value.toLowerCase();
+            elementText = element.value.toLowerCase().trim();
         } else {
             elementText = getElementBaseText(element);
         }
 
         if (elementText.length) {
             // try which ones could mean its in stock
-            if (negateOutOfStockRegex.test(elementText)) {
+            if (negateOutOfStockRegex.test(elementText) && !elementText.includes('(0 products)')) {
+                console.log(`Negating/overriding 'Out of Stock' back to "Possibly in stock" found "${elementText}"`)
                 return 'Possibly in stock';
             }
         }
@@ -112,28 +154,34 @@ function isItemInStock() {
     // OTHER STUFF THAT COULD BE THAT IT'S OUT OF STOCK
     for (let i = elementsToScan.length - 1; i >= 0; i--) {
         const element = elementsToScan[i];
-        if (element.offsetWidth > 0 || element.offsetHeight > 0 || element.getClientRects().length > 0) {
-            elementText = "";
-            if (element.tagName.toLowerCase() === "input") {
-                elementText = element.value.toLowerCase();
-            } else {
-                elementText = getElementBaseText(element);
-            }
+        // outside the 'fold' or some weird text in the heading area
+        // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
+        if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) {
+            continue
+        }
+        elementText = "";
+        if (element.tagName.toLowerCase() === "input") {
+            elementText = element.value.toLowerCase().trim();
+        } else {
+            elementText = getElementBaseText(element);
+        }
 
-            if (elementText.length) {
-                // and these mean its out of stock
-                for (const outOfStockText of outOfStockTexts) {
-                    if (elementText.includes(outOfStockText)) {
-                        return outOfStockText; // item is out of stock
-                    }
+        if (elementText.length) {
+            // and these mean its out of stock
+            for (const outOfStockText of outOfStockTexts) {
+                if (elementText.includes(outOfStockText)) {
+                    console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}"`)
+                    return outOfStockText; // item is out of stock
                 }
             }
         }
     }
 
+    console.log(`Returning 'Possibly in stock' - cant' find any useful matching text`)
     return 'Possibly in stock'; // possibly in stock, cant decide otherwise.
 }
 
 // returns the element text that makes it think it's out of stock
 return isItemInStock().trim()
 
+
diff --git a/changedetectionio/content_fetchers/res/xpath_element_scraper.js b/changedetectionio/content_fetchers/res/xpath_element_scraper.js
index efe593d0..326889ea 100644
--- a/changedetectionio/content_fetchers/res/xpath_element_scraper.js
+++ b/changedetectionio/content_fetchers/res/xpath_element_scraper.js
@@ -16,24 +16,23 @@ try {
 }
 
 
-
 // Include the getXpath script directly, easier than fetching
 function getxpath(e) {
-        var n = e;
-        if (n && n.id) return '//*[@id="' + n.id + '"]';
-        for (var o = []; n && Node.ELEMENT_NODE === n.nodeType;) {
-            for (var i = 0, r = !1, d = n.previousSibling; d;) d.nodeType !== Node.DOCUMENT_TYPE_NODE && d.nodeName === n.nodeName && i++, d = d.previousSibling;
-            for (d = n.nextSibling; d;) {
-                if (d.nodeName === n.nodeName) {
-                    r = !0;
-                    break
-                }
-                d = d.nextSibling
+    var n = e;
+    if (n && n.id) return '//*[@id="' + n.id + '"]';
+    for (var o = []; n && Node.ELEMENT_NODE === n.nodeType;) {
+        for (var i = 0, r = !1, d = n.previousSibling; d;) d.nodeType !== Node.DOCUMENT_TYPE_NODE && d.nodeName === n.nodeName && i++, d = d.previousSibling;
+        for (d = n.nextSibling; d;) {
+            if (d.nodeName === n.nodeName) {
+                r = !0;
+                break
             }
-            o.push((n.prefix ? n.prefix + ":" : "") + n.localName + (i || r ? "[" + (i + 1) + "]" : "")), n = n.parentNode
+            d = d.nextSibling
         }
-        return o.length ? "/" + o.reverse().join("/") : ""
+        o.push((n.prefix ? n.prefix + ":" : "") + n.localName + (i || r ? "[" + (i + 1) + "]" : "")), n = n.parentNode
     }
+    return o.length ? "/" + o.reverse().join("/") : ""
+}
 
 const findUpTag = (el) => {
     let r = el
@@ -59,14 +58,14 @@ const findUpTag = (el) => {
 
     // Strategy 2: Keep going up until we hit an ID tag, imagine it's like  #list-widget div h4
     while (r.parentNode) {
-        if (depth == 5) {
+        if (depth === 5) {
             break;
         }
         if ('' !== r.id) {
             chained_css.unshift("#" + CSS.escape(r.id));
             final_selector = chained_css.join(' > ');
             // Be sure theres only one, some sites have multiples of the same ID tag :-(
-            if (window.document.querySelectorAll(final_selector).length == 1) {
+            if (window.document.querySelectorAll(final_selector).length === 1) {
                 return final_selector;
             }
             return null;
@@ -82,30 +81,60 @@ const findUpTag = (el) => {
 
 // @todo - if it's SVG or IMG, go into image diff mode
 // %ELEMENTS% replaced at injection time because different interfaces use it with different settings
-var elements = window.document.querySelectorAll("%ELEMENTS%");
+
 var size_pos = [];
 // after page fetch, inject this JS
 // build a map of all elements and their positions (maybe that only include text?)
 var bbox;
-for (var i = 0; i < elements.length; i++) {
-    bbox = elements[i].getBoundingClientRect();
+console.log("Scanning %ELEMENTS%");
+
+function collectVisibleElements(parent, visibleElements) {
+    if (!parent) return; // Base case: if parent is null or undefined, return
+
 
-    // Exclude items that are not interactable or visible
-    if(elements[i].style.opacity === "0") {
-        continue
+    // Add the parent itself to the visible elements array if it's of the specified types
+    const tagName = parent.tagName.toLowerCase();
+    if ("%ELEMENTS%".split(',').map((t) => t.trim().toLowerCase()).includes(tagName)) { // trim/lowercase each token so "a, b" style lists keep matching, as they did with querySelectorAll
+        visibleElements.push(parent);
     }
-    if(elements[i].style.display === "none" || elements[i].style.pointerEvents === "none" ) {
-        continue
+
+    // Iterate over the parent's children
+    const children = parent.children;
+    for (let i = 0; i < children.length; i++) {
+        const child = children[i];
+        if (
+            child.nodeType === Node.ELEMENT_NODE &&
+            window.getComputedStyle(child).display !== 'none' &&
+            window.getComputedStyle(child).visibility !== 'hidden' &&
+            child.offsetWidth >= 0 &&
+            child.offsetHeight >= 0 &&
+            window.getComputedStyle(child).contentVisibility !== 'hidden'
+        ) {
+            // If the child is an element and is visible, recursively collect visible elements
+            collectVisibleElements(child, visibleElements);
+        }
     }
+}
+
+// Create an array to hold the visible elements
+const visibleElementsArray = [];
+
+// Call collectVisibleElements with the starting parent element
+collectVisibleElements(document.body, visibleElementsArray);
+
+
+visibleElementsArray.forEach(function (element) {
+
+    bbox = element.getBoundingClientRect();
 
     // Skip really small ones, and where width or height ==0
-    if (bbox['width'] * bbox['height'] < 100) {
-        continue;
+    if (bbox['width'] * bbox['height'] < 10) {
+        return
     }
 
     // Don't include elements that are offset from canvas
-    if (bbox['top']+scroll_y < 0 || bbox['left'] < 0) {
-        continue;
+    if (bbox['top'] + scroll_y < 0 || bbox['left'] < 0) {
+        return
     }
 
     // @todo the getXpath kind of sucks, it doesnt know when there is for example just one ID sometimes
@@ -114,46 +143,41 @@ for (var i = 0; i < elements.length; i++) {
 
     // 1st primitive - if it has class, try joining it all and select, if theres only one.. well thats us.
     xpath_result = false;
-
     try {
-        var d = findUpTag(elements[i]);
+        var d = findUpTag(element);
         if (d) {
             xpath_result = d;
         }
     } catch (e) {
         console.log(e);
     }
-
     // You could swap it and default to getXpath and then try the smarter one
     // default back to the less intelligent one
     if (!xpath_result) {
         try {
             // I've seen on FB and eBay that this doesnt work
             // ReferenceError: getXPath is not defined at eval (eval at evaluate (:152:29), <anonymous>:67:20) at UtilityScript.evaluate (<anonymous>:159:18) at UtilityScript.<anonymous> (<anonymous>:1:44)
-            xpath_result = getxpath(elements[i]);
+            xpath_result = getxpath(element);
         } catch (e) {
             console.log(e);
-            continue;
+            return
         }
     }
 
-    if (window.getComputedStyle(elements[i]).visibility === "hidden") {
-        continue;
-    }
 
-    // @todo Possible to ONLY list where it's clickable to save JSON xfer size
     size_pos.push({
         xpath: xpath_result,
         width: Math.round(bbox['width']),
         height: Math.round(bbox['height']),
         left: Math.floor(bbox['left']),
-        top: Math.floor(bbox['top'])+scroll_y,
-        tagName: (elements[i].tagName) ? elements[i].tagName.toLowerCase() : '',
-        tagtype: (elements[i].tagName == 'INPUT' && elements[i].type) ? elements[i].type.toLowerCase() : '',
-        isClickable: (elements[i].onclick) || window.getComputedStyle(elements[i]).cursor == "pointer"
+        top: Math.floor(bbox['top']) + scroll_y,
+        tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
+        tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
+        isClickable: window.getComputedStyle(element).cursor === "pointer" // strict comparison, matching the other === conversions in this patch
     });
 
-}
+});
+
 
 // Inject the current one set in the include_filters, which may be a CSS rule
 // used for displaying the current one in VisualSelector, where its not one we generated.
@@ -180,7 +204,7 @@ if (include_filters.length) {
             }
         } catch (e) {
             // Maybe catch DOMException and alert?
-            console.log("xpath_element_scraper: Exception selecting element from filter "+f);
+            console.log("xpath_element_scraper: Exception selecting element from filter " + f);
             console.log(e);
         }
 
@@ -210,8 +234,8 @@ if (include_filters.length) {
                 }
             }
         }
-        
-        if(!q) {
+
+        if (!q) {
             console.log("xpath_element_scraper: filter element " + f + " was not found");
         }
 
@@ -221,7 +245,7 @@ if (include_filters.length) {
                 width: parseInt(bbox['width']),
                 height: parseInt(bbox['height']),
                 left: parseInt(bbox['left']),
-                top: parseInt(bbox['top'])+scroll_y
+                top: parseInt(bbox['top']) + scroll_y
             });
         }
     }
@@ -229,7 +253,7 @@ if (include_filters.length) {
 
 // Sort the elements so we find the smallest one first, in other words, we find the smallest one matching in that area
 // so that we dont select the wrapping element by mistake and be unable to select what we want
-size_pos.sort((a, b) => (a.width*a.height > b.width*b.height) ? 1 : -1)
+size_pos.sort((a, b) => (a.width * a.height > b.width * b.height) ? 1 : -1)
 
 // Window.width required for proper scaling in the frontend
 return {'size_pos': size_pos, 'browser_width': window.innerWidth};