From 7071df061ad06048b4b0c5522e4d54171a7aa371 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Thu, 22 Aug 2024 15:01:36 +0200 Subject: [PATCH] Price detection/scraping - Adding extra element training data (#2582) --- .../res/xpath_element_scraper.js | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/changedetectionio/content_fetchers/res/xpath_element_scraper.js b/changedetectionio/content_fetchers/res/xpath_element_scraper.js index 87c0df70..ccd89436 100644 --- a/changedetectionio/content_fetchers/res/xpath_element_scraper.js +++ b/changedetectionio/content_fetchers/res/xpath_element_scraper.js @@ -164,6 +164,15 @@ visibleElementsArray.forEach(function (element) { } } + let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now + + let text = element.textContent.trim().slice(0, 30).trim(); + while (/\n{2,}|\t{2,}/.test(text)) { + text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t') + } + + // Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training. + const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text) ; size_pos.push({ xpath: xpath_result, @@ -171,9 +180,16 @@ visibleElementsArray.forEach(function (element) { height: Math.round(bbox['height']), left: Math.floor(bbox['left']), top: Math.floor(bbox['top']) + scroll_y, + // tagName used by Browser Steps tagName: (element.tagName) ? element.tagName.toLowerCase() : '', + // tagtype used by Browser Steps tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '', - isClickable: window.getComputedStyle(element).cursor == "pointer" + isClickable: window.getComputedStyle(element).cursor === "pointer", + // Used by the keras trainer + fontSize: window.getComputedStyle(element).getPropertyValue('font-size'), + fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'), + hasDigitCurrency: hasDigitCurrency, + label: label, }); });