From 4225900ec3ae38651c7db24a8f18e297936a87f2 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Sun, 1 Sep 2024 12:47:21 +0200 Subject: [PATCH 1/5] Restock - updating texts and text offsets --- changedetectionio/content_fetchers/res/stock-not-in-stock.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/changedetectionio/content_fetchers/res/stock-not-in-stock.js b/changedetectionio/content_fetchers/res/stock-not-in-stock.js index 94c6350d..6958a4d5 100644 --- a/changedetectionio/content_fetchers/res/stock-not-in-stock.js +++ b/changedetectionio/content_fetchers/res/stock-not-in-stock.js @@ -75,6 +75,7 @@ function isItemInStock() { 'vergriffen', 'vorbestellen', 'vorbestellung ist bald möglich', + 'we don\'t currently have any', 'we couldn\'t find any products that match', 'we do not currently have an estimate of when this product will be back in stock.', 'we don\'t know when or if this item will be back in stock.', @@ -173,7 +174,7 @@ function isItemInStock() { const element = elementsToScan[i]; // outside the 'fold' or some weird text in the heading area // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden - if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) { + if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 200) { continue } elementText = ""; @@ -187,7 +188,7 @@ function isItemInStock() { // and these mean its out of stock for (const outOfStockText of outOfStockTexts) { if (elementText.includes(outOfStockText)) { - console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}"`) + console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`) return outOfStockText; // item is out of stock } } From 
55fe2abf42e819a6292d32021348830ba4b612a3 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Sun, 1 Sep 2024 13:07:06 +0200 Subject: [PATCH 2/5] Restock/Price detection - Better catching of errors when parsing metadata documents for restock/price check (#2602) --- changedetectionio/processors/restock_diff/processor.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/changedetectionio/processors/restock_diff/processor.py b/changedetectionio/processors/restock_diff/processor.py index b2184e35..1a3a96ca 100644 --- a/changedetectionio/processors/restock_diff/processor.py +++ b/changedetectionio/processors/restock_diff/processor.py @@ -40,13 +40,16 @@ def get_itemprop_availability(html_content) -> Restock: import extruct logger.trace(f"Imported extruct module in {time.time() - now:.3f}s") - value = {} now = time.time() - # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest. + # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest. 
syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph'] + try: + data = extruct.extract(html_content, syntaxes=syntaxes) + except Exception as e: + logger.warning(f"Unable to extract data, document parsing with extruct failed with {type(e).__name__} - {str(e)}") + return Restock() - data = extruct.extract(html_content, syntaxes=syntaxes) logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s") # First phase, dead simple scanning of anything that looks useful From 1cb38347daac33d5a1e20d76b4b14ddf2876097a Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Mon, 2 Sep 2024 13:20:44 +0200 Subject: [PATCH 3/5] Container name should be 'sockpuppetbrowser' because its not just playwright that uses it --- docker-compose.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 2480a339..108ea093 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -18,7 +18,7 @@ services: # # Log levels are in descending order. (TRACE is the most detailed one) # Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL - # - LOGGER_LEVEL=DEBUG + # - LOGGER_LEVEL=TRACE # # Alternative WebDriver/selenium URL, do not use "'s or 's! # - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub @@ -29,8 +29,9 @@ services: # # https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy # - # Alternative Playwright URL, do not use "'s or 's! - # - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000 + # Alternative target "Chrome" Playwright URL, do not use "'s or 's! + # "Playwright" is a driver/library that allows changedetection to talk to a Chrome or similar browser. 
+ # - PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000 # # Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password # @@ -73,10 +74,10 @@ services: # condition: service_started - # Used for fetching pages via Playwright+Chrome where you need Javascript support. + # Sockpuppetbrowser is basically chrome wrapped in an API for allowing fast fetching of web-pages. # RECOMMENDED FOR FETCHING PAGES WITH CHROME -# playwright-chrome: -# hostname: playwright-chrome +# sockpuppetbrowser: +# hostname: sockpuppetbrowser # image: dgtlmoon/sockpuppetbrowser:latest # cap_add: # - SYS_ADMIN From 60d292107d9599104e79a890374affd8c2e3af3f Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Mon, 2 Sep 2024 15:11:31 +0200 Subject: [PATCH 4/5] Fixing restock monitor tests and tweaking docker default config example, --- changedetectionio/content_fetchers/res/stock-not-in-stock.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/changedetectionio/content_fetchers/res/stock-not-in-stock.js b/changedetectionio/content_fetchers/res/stock-not-in-stock.js index 6958a4d5..df33fbe6 100644 --- a/changedetectionio/content_fetchers/res/stock-not-in-stock.js +++ b/changedetectionio/content_fetchers/res/stock-not-in-stock.js @@ -174,7 +174,8 @@ function isItemInStock() { const element = elementsToScan[i]; // outside the 'fold' or some weird text in the heading area // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden - if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 200) { + // Note: there's also an automated test that places the 'out of stock' text fairly low down + if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) { continue } elementText = ""; From 5b70625eaabba8ed85bb271f32c6c955513e1ef2 Mon Sep 
17 00:00:00 2001 From: dgtlmoon Date: Wed, 4 Sep 2024 13:55:18 +0200 Subject: [PATCH 5/5] 0.46.04 --- changedetectionio/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 1872a520..ca262be7 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -2,7 +2,7 @@ # Read more https://github.com/dgtlmoon/changedetection.io/wiki -__version__ = '0.46.03' +__version__ = '0.46.04' from changedetectionio.strtobool import strtobool from json.decoder import JSONDecodeError