diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 1872a520..ca262be7 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -2,7 +2,7 @@ # Read more https://github.com/dgtlmoon/changedetection.io/wiki -__version__ = '0.46.03' +__version__ = '0.46.04' from changedetectionio.strtobool import strtobool from json.decoder import JSONDecodeError diff --git a/changedetectionio/content_fetchers/res/stock-not-in-stock.js b/changedetectionio/content_fetchers/res/stock-not-in-stock.js index 94c6350d..df33fbe6 100644 --- a/changedetectionio/content_fetchers/res/stock-not-in-stock.js +++ b/changedetectionio/content_fetchers/res/stock-not-in-stock.js @@ -75,6 +75,7 @@ function isItemInStock() { 'vergriffen', 'vorbestellen', 'vorbestellung ist bald möglich', + 'we don\'t currently have any', 'we couldn\'t find any products that match', 'we do not currently have an estimate of when this product will be back in stock.', 'we don\'t know when or if this item will be back in stock.', @@ -173,7 +174,8 @@ function isItemInStock() { const element = elementsToScan[i]; // outside the 'fold' or some weird text in the heading area // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden - if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) { + // Note: theres also an automated test that places the 'out of stock' text fairly low down + if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) { continue } elementText = ""; @@ -187,7 +189,7 @@ function isItemInStock() { // and these mean its out of stock for (const outOfStockText of outOfStockTexts) { if (elementText.includes(outOfStockText)) { - console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}"`) + console.log(`Selected 'Out of Stock' - found 
text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`) return outOfStockText; // item is out of stock } } diff --git a/changedetectionio/processors/restock_diff/processor.py b/changedetectionio/processors/restock_diff/processor.py index 6b7ef5db..90344ffd 100644 --- a/changedetectionio/processors/restock_diff/processor.py +++ b/changedetectionio/processors/restock_diff/processor.py @@ -44,13 +44,16 @@ def get_itemprop_availability(html_content) -> Restock: import extruct logger.trace(f"Imported extruct module in {time.time() - now:.3f}s") - value = {} now = time.time() - # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest. + # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest. syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph'] + try: + data = extruct.extract(html_content, syntaxes=syntaxes) + except Exception as e: + logger.warning(f"Unable to extract data, document parsing with extruct failed with {type(e).__name__} - {str(e)}") + return Restock() - data = extruct.extract(html_content, syntaxes=syntaxes) logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s") # First phase, dead simple scanning of anything that looks useful diff --git a/docker-compose.yml b/docker-compose.yml index b3441ac3..1c6b1732 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -18,7 +18,7 @@ services: # # Log levels are in descending order. (TRACE is the most detailed one) # Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL - # - LOGGER_LEVEL=DEBUG + # - LOGGER_LEVEL=TRACE # # Alternative WebDriver/selenium URL, do not use "'s or 's! 
# - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub @@ -29,8 +29,9 @@ services: # # https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy # - # Alternative Playwright URL, do not use "'s or 's! - # - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000 + # Alternative target "Chrome" Playwright URL, do not use "'s or 's! + # "Playwright" is a driver/library that allows changedetection to talk to a Chrome or similar browser. + # - PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000 # # Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password # @@ -77,10 +78,10 @@ services: # condition: service_started - # Used for fetching pages via Playwright+Chrome where you need Javascript support. + # Sockpuppetbrowser is basically chrome wrapped in an API for allowing fast fetching of web-pages. # RECOMMENDED FOR FETCHING PAGES WITH CHROME -# playwright-chrome: -# hostname: playwright-chrome +# sockpuppetbrowser: +# hostname: sockpuppetbrowser # image: dgtlmoon/sockpuppetbrowser:latest # cap_add: # - SYS_ADMIN