Fetching - restock detecting and visual selector scraper - Fixes scraping of elements that are not visible

pull/2185/head^2
dgtlmoon 3 months ago committed by GitHub
parent 3d390b6ea4
commit a62043e086
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -10,7 +10,7 @@ function isItemInStock() {
const outOfStockTexts = [ const outOfStockTexts = [
' أخبرني عندما يتوفر', ' أخبرني عندما يتوفر',
'0 in stock', '0 in stock',
'actuellement indisponible', 'actuellement indisponible',
'agotado', 'agotado',
'article épuisé', 'article épuisé',
'artikel zurzeit vergriffen', 'artikel zurzeit vergriffen',
@ -45,9 +45,9 @@ function isItemInStock() {
'no tickets available', 'no tickets available',
'not available', 'not available',
'not currently available', 'not currently available',
'not in stock', 'not in stock',
'notify me when available', 'notify me when available',
'notify when available', 'notify when available',
'não estamos a aceitar encomendas', 'não estamos a aceitar encomendas',
'out of stock', 'out of stock',
'out-of-stock', 'out-of-stock',
@ -70,7 +70,9 @@ function isItemInStock() {
'품절' '품절'
]; ];
const vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0); const vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0);
function getElementBaseText(element) { function getElementBaseText(element) {
// .textContent can include text from children which may give the wrong results // .textContent can include text from children which may give the wrong results
// scan only immediate TEXT_NODEs, which will be a child of the element // scan only immediate TEXT_NODEs, which will be a child of the element
@ -81,29 +83,69 @@ function isItemInStock() {
return text.toLowerCase().trim(); return text.toLowerCase().trim();
} }
const negateOutOfStockRegex = new RegExp('([0-9] in stock|add to cart)', 'ig'); const negateOutOfStockRegex = new RegExp('^([0-9] in stock|add to cart|in stock)', 'ig');
// The out-of-stock or in-stock-text is generally always above-the-fold // The out-of-stock or in-stock-text is generally always above-the-fold
// and often below-the-fold is a list of related products that may or may not contain trigger text // and often below-the-fold is a list of related products that may or may not contain trigger text
// so it's good to filter to just the 'above the fold' elements // so it's good to filter to just the 'above the fold' elements
// and it should be atleast 100px from the top to ignore items in the toolbar, sometimes menu items like "Coming soon" exist // and it should be atleast 100px from the top to ignore items in the toolbar, sometimes menu items like "Coming soon" exist
const elementsToScan = Array.from(document.getElementsByTagName('*')).filter(element => element.getBoundingClientRect().top + window.scrollY <= vh && element.getBoundingClientRect().top + window.scrollY >= 100);
// @todo - if it's SVG or IMG, go into image diff mode
// %ELEMENTS% replaced at injection time because different interfaces use it with different settings
console.log("Scanning %ELEMENTS%");
/**
 * Depth-first walk that appends `parent` and every descendant reachable
 * through a chain of visible children to `visibleElements`.
 *
 * A child is descended into only when its computed style is not
 * `display: none`, `visibility: hidden` or `content-visibility: hidden`,
 * and both `offsetWidth` and `offsetHeight` compare `>= 0`. Children that
 * fail the check are skipped together with their whole subtree.
 *
 * @param {Element|null|undefined} parent - Root of the subtree to walk; it is
 *     pushed unconditionally (callers decide whether the root itself is visible).
 * @param {Element[]} visibleElements - Output array, mutated in place.
 * @returns {void}
 */
function collectVisibleElements(parent, visibleElements) {
    if (!parent) return; // Base case: nothing to walk

    // The parent is always recorded; visibility is only tested on children below.
    visibleElements.push(parent);

    for (const child of parent.children) {
        if (child.nodeType !== Node.ELEMENT_NODE) {
            continue;
        }

        // One computed-style lookup per child instead of one per property tested.
        const style = window.getComputedStyle(child);

        // NOTE: `>= 0` is false for `undefined`, so children that do not expose
        // offsetWidth/offsetHeight are skipped (presumably non-HTML elements such
        // as SVG - confirm); zero-sized boxes are still accepted.
        if (
            style.display !== 'none' &&
            style.visibility !== 'hidden' &&
            child.offsetWidth >= 0 &&
            child.offsetHeight >= 0 &&
            style.contentVisibility !== 'hidden'
        ) {
            // Visible child: record it and keep descending.
            collectVisibleElements(child, visibleElements);
        }
    }
}
const elementsToScan = [];
collectVisibleElements(document.body, elementsToScan);
var elementText = ""; var elementText = "";
// REGEXS THAT REALLY MEAN IT'S IN STOCK // REGEXS THAT REALLY MEAN IT'S IN STOCK
for (let i = elementsToScan.length - 1; i >= 0; i--) { for (let i = elementsToScan.length - 1; i >= 0; i--) {
const element = elementsToScan[i]; const element = elementsToScan[i];
// outside the 'fold' or some weird text in the heading area
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
if (element.getBoundingClientRect().top + window.scrollY >= vh || element.getBoundingClientRect().top + window.scrollY <= 100) {
continue
}
elementText = ""; elementText = "";
if (element.tagName.toLowerCase() === "input") { if (element.tagName.toLowerCase() === "input") {
elementText = element.value.toLowerCase(); elementText = element.value.toLowerCase().trim();
} else { } else {
elementText = getElementBaseText(element); elementText = getElementBaseText(element);
} }
if (elementText.length) { if (elementText.length) {
// try which ones could mean its in stock // try which ones could mean its in stock
if (negateOutOfStockRegex.test(elementText)) { if (negateOutOfStockRegex.test(elementText) && !elementText.includes('(0 products)')) {
console.log(`Negating/overriding 'Out of Stock' back to "Possibly in stock" found "${elementText}"`)
return 'Possibly in stock'; return 'Possibly in stock';
} }
} }
@ -112,28 +154,34 @@ function isItemInStock() {
// OTHER STUFF THAT COULD BE THAT IT'S OUT OF STOCK // OTHER STUFF THAT COULD BE THAT IT'S OUT OF STOCK
for (let i = elementsToScan.length - 1; i >= 0; i--) { for (let i = elementsToScan.length - 1; i >= 0; i--) {
const element = elementsToScan[i]; const element = elementsToScan[i];
if (element.offsetWidth > 0 || element.offsetHeight > 0 || element.getClientRects().length > 0) { // outside the 'fold' or some weird text in the heading area
elementText = ""; // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
if (element.tagName.toLowerCase() === "input") { if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) {
elementText = element.value.toLowerCase(); continue
} else { }
elementText = getElementBaseText(element); elementText = "";
} if (element.tagName.toLowerCase() === "input") {
elementText = element.value.toLowerCase().trim();
} else {
elementText = getElementBaseText(element);
}
if (elementText.length) { if (elementText.length) {
// and these mean its out of stock // and these mean its out of stock
for (const outOfStockText of outOfStockTexts) { for (const outOfStockText of outOfStockTexts) {
if (elementText.includes(outOfStockText)) { if (elementText.includes(outOfStockText)) {
return outOfStockText; // item is out of stock console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}"`)
} return outOfStockText; // item is out of stock
} }
} }
} }
} }
console.log(`Returning 'Possibly in stock' - cant' find any useful matching text`)
return 'Possibly in stock'; // possibly in stock, cant decide otherwise. return 'Possibly in stock'; // possibly in stock, cant decide otherwise.
} }
// returns the element text that makes it think it's out of stock // returns the element text that makes it think it's out of stock
return isItemInStock().trim() return isItemInStock().trim()

@ -16,24 +16,23 @@ try {
} }
// Include the getXpath script directly, easier than fetching // Include the getXpath script directly, easier than fetching
function getxpath(e) { function getxpath(e) {
var n = e; var n = e;
if (n && n.id) return '//*[@id="' + n.id + '"]'; if (n && n.id) return '//*[@id="' + n.id + '"]';
for (var o = []; n && Node.ELEMENT_NODE === n.nodeType;) { for (var o = []; n && Node.ELEMENT_NODE === n.nodeType;) {
for (var i = 0, r = !1, d = n.previousSibling; d;) d.nodeType !== Node.DOCUMENT_TYPE_NODE && d.nodeName === n.nodeName && i++, d = d.previousSibling; for (var i = 0, r = !1, d = n.previousSibling; d;) d.nodeType !== Node.DOCUMENT_TYPE_NODE && d.nodeName === n.nodeName && i++, d = d.previousSibling;
for (d = n.nextSibling; d;) { for (d = n.nextSibling; d;) {
if (d.nodeName === n.nodeName) { if (d.nodeName === n.nodeName) {
r = !0; r = !0;
break break
}
d = d.nextSibling
} }
o.push((n.prefix ? n.prefix + ":" : "") + n.localName + (i || r ? "[" + (i + 1) + "]" : "")), n = n.parentNode d = d.nextSibling
} }
return o.length ? "/" + o.reverse().join("/") : "" o.push((n.prefix ? n.prefix + ":" : "") + n.localName + (i || r ? "[" + (i + 1) + "]" : "")), n = n.parentNode
} }
return o.length ? "/" + o.reverse().join("/") : ""
}
const findUpTag = (el) => { const findUpTag = (el) => {
let r = el let r = el
@ -59,14 +58,14 @@ const findUpTag = (el) => {
// Strategy 2: Keep going up until we hit an ID tag, imagine it's like #list-widget div h4 // Strategy 2: Keep going up until we hit an ID tag, imagine it's like #list-widget div h4
while (r.parentNode) { while (r.parentNode) {
if (depth == 5) { if (depth === 5) {
break; break;
} }
if ('' !== r.id) { if ('' !== r.id) {
chained_css.unshift("#" + CSS.escape(r.id)); chained_css.unshift("#" + CSS.escape(r.id));
final_selector = chained_css.join(' > '); final_selector = chained_css.join(' > ');
// Be sure theres only one, some sites have multiples of the same ID tag :-( // Be sure theres only one, some sites have multiples of the same ID tag :-(
if (window.document.querySelectorAll(final_selector).length == 1) { if (window.document.querySelectorAll(final_selector).length === 1) {
return final_selector; return final_selector;
} }
return null; return null;
@ -82,30 +81,60 @@ const findUpTag = (el) => {
// @todo - if it's SVG or IMG, go into image diff mode // @todo - if it's SVG or IMG, go into image diff mode
// %ELEMENTS% replaced at injection time because different interfaces use it with different settings // %ELEMENTS% replaced at injection time because different interfaces use it with different settings
var elements = window.document.querySelectorAll("%ELEMENTS%");
var size_pos = []; var size_pos = [];
// after page fetch, inject this JS // after page fetch, inject this JS
// build a map of all elements and their positions (maybe that only include text?) // build a map of all elements and their positions (maybe that only include text?)
var bbox; var bbox;
for (var i = 0; i < elements.length; i++) { console.log("Scanning %ELEMENTS%");
bbox = elements[i].getBoundingClientRect();
function collectVisibleElements(parent, visibleElements) {
if (!parent) return; // Base case: if parent is null or undefined, return
// Exclude items that are not interactable or visible // Add the parent itself to the visible elements array if it's of the specified types
if(elements[i].style.opacity === "0") { const tagName = parent.tagName.toLowerCase();
continue if ("%ELEMENTS%".split(',').includes(tagName)) {
visibleElements.push(parent);
} }
if(elements[i].style.display === "none" || elements[i].style.pointerEvents === "none" ) {
continue // Iterate over the parent's children
const children = parent.children;
for (let i = 0; i < children.length; i++) {
const child = children[i];
if (
child.nodeType === Node.ELEMENT_NODE &&
window.getComputedStyle(child).display !== 'none' &&
window.getComputedStyle(child).visibility !== 'hidden' &&
child.offsetWidth >= 0 &&
child.offsetHeight >= 0 &&
window.getComputedStyle(child).contentVisibility !== 'hidden'
) {
// If the child is an element and is visible, recursively collect visible elements
collectVisibleElements(child, visibleElements);
}
} }
}
// Create an array to hold the visible elements
const visibleElementsArray = [];
// Call collectVisibleElements with the starting parent element
collectVisibleElements(document.body, visibleElementsArray);
visibleElementsArray.forEach(function (element) {
bbox = element.getBoundingClientRect();
// Skip really small ones, and where width or height ==0 // Skip really small ones, and where width or height ==0
if (bbox['width'] * bbox['height'] < 100) { if (bbox['width'] * bbox['height'] < 10) {
continue; return
} }
// Don't include elements that are offset from canvas // Don't include elements that are offset from canvas
if (bbox['top']+scroll_y < 0 || bbox['left'] < 0) { if (bbox['top'] + scroll_y < 0 || bbox['left'] < 0) {
continue; return
} }
// @todo the getXpath kind of sucks, it doesnt know when there is for example just one ID sometimes // @todo the getXpath kind of sucks, it doesnt know when there is for example just one ID sometimes
@ -114,46 +143,41 @@ for (var i = 0; i < elements.length; i++) {
// 1st primitive - if it has class, try joining it all and select, if theres only one.. well thats us. // 1st primitive - if it has class, try joining it all and select, if theres only one.. well thats us.
xpath_result = false; xpath_result = false;
try { try {
var d = findUpTag(elements[i]); var d = findUpTag(element);
if (d) { if (d) {
xpath_result = d; xpath_result = d;
} }
} catch (e) { } catch (e) {
console.log(e); console.log(e);
} }
// You could swap it and default to getXpath and then try the smarter one // You could swap it and default to getXpath and then try the smarter one
// default back to the less intelligent one // default back to the less intelligent one
if (!xpath_result) { if (!xpath_result) {
try { try {
// I've seen on FB and eBay that this doesnt work // I've seen on FB and eBay that this doesnt work
// ReferenceError: getXPath is not defined at eval (eval at evaluate (:152:29), <anonymous>:67:20) at UtilityScript.evaluate (<anonymous>:159:18) at UtilityScript.<anonymous> (<anonymous>:1:44) // ReferenceError: getXPath is not defined at eval (eval at evaluate (:152:29), <anonymous>:67:20) at UtilityScript.evaluate (<anonymous>:159:18) at UtilityScript.<anonymous> (<anonymous>:1:44)
xpath_result = getxpath(elements[i]); xpath_result = getxpath(element);
} catch (e) { } catch (e) {
console.log(e); console.log(e);
continue; return
} }
} }
if (window.getComputedStyle(elements[i]).visibility === "hidden") {
continue;
}
// @todo Possible to ONLY list where it's clickable to save JSON xfer size
size_pos.push({ size_pos.push({
xpath: xpath_result, xpath: xpath_result,
width: Math.round(bbox['width']), width: Math.round(bbox['width']),
height: Math.round(bbox['height']), height: Math.round(bbox['height']),
left: Math.floor(bbox['left']), left: Math.floor(bbox['left']),
top: Math.floor(bbox['top'])+scroll_y, top: Math.floor(bbox['top']) + scroll_y,
tagName: (elements[i].tagName) ? elements[i].tagName.toLowerCase() : '', tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
tagtype: (elements[i].tagName == 'INPUT' && elements[i].type) ? elements[i].type.toLowerCase() : '', tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
isClickable: (elements[i].onclick) || window.getComputedStyle(elements[i]).cursor == "pointer" isClickable: window.getComputedStyle(element).cursor == "pointer"
}); });
} });
// Inject the current one set in the include_filters, which may be a CSS rule // Inject the current one set in the include_filters, which may be a CSS rule
// used for displaying the current one in VisualSelector, where its not one we generated. // used for displaying the current one in VisualSelector, where its not one we generated.
@ -180,7 +204,7 @@ if (include_filters.length) {
} }
} catch (e) { } catch (e) {
// Maybe catch DOMException and alert? // Maybe catch DOMException and alert?
console.log("xpath_element_scraper: Exception selecting element from filter "+f); console.log("xpath_element_scraper: Exception selecting element from filter " + f);
console.log(e); console.log(e);
} }
@ -210,8 +234,8 @@ if (include_filters.length) {
} }
} }
} }
if(!q) { if (!q) {
console.log("xpath_element_scraper: filter element " + f + " was not found"); console.log("xpath_element_scraper: filter element " + f + " was not found");
} }
@ -221,7 +245,7 @@ if (include_filters.length) {
width: parseInt(bbox['width']), width: parseInt(bbox['width']),
height: parseInt(bbox['height']), height: parseInt(bbox['height']),
left: parseInt(bbox['left']), left: parseInt(bbox['left']),
top: parseInt(bbox['top'])+scroll_y top: parseInt(bbox['top']) + scroll_y
}); });
} }
} }
@ -229,7 +253,7 @@ if (include_filters.length) {
// Sort the elements so we find the smallest one first, in other words, we find the smallest one matching in that area // Sort the elements so we find the smallest one first, in other words, we find the smallest one matching in that area
// so that we dont select the wrapping element by mistake and be unable to select what we want // so that we dont select the wrapping element by mistake and be unable to select what we want
size_pos.sort((a, b) => (a.width*a.height > b.width*b.height) ? 1 : -1) size_pos.sort((a, b) => (a.width * a.height > b.width * b.height) ? 1 : -1)
// Window.width required for proper scaling in the frontend // Window.width required for proper scaling in the frontend
return {'size_pos': size_pos, 'browser_width': window.innerWidth}; return {'size_pos': size_pos, 'browser_width': window.innerWidth};

Loading…
Cancel
Save