Merge branch 'master' into price-scraper-ML-integration

4 months ago · f967893bd8
parent 392cc4586f 5b70625eaa
commit f967893bd8
4 changed files with 18 additions and 12 deletions
--- a/changedetectionio/__init__.py
+++ b/changedetectionio/__init__.py
@ -2,7 +2,7 @@
 # Read more https://github.com/dgtlmoon/changedetection.io/wiki
-__version__ = '0.46.03'
+__version__ = '0.46.04'
 from changedetectionio.strtobool import strtobool
 from json.decoder import JSONDecodeError
--- a/changedetectionio/content_fetchers/res/stock-not-in-stock.js
+++ b/changedetectionio/content_fetchers/res/stock-not-in-stock.js
@ -75,6 +75,7 @@ function isItemInStock() {
        'vergriffen',
        'vorbestellen',
        'vorbestellung ist bald möglich',
        'we don\'t currently have any',
        'we couldn\'t find any products that match',
        'we do not currently have an estimate of when this product will be back in stock.',
        'we don\'t know when or if this item will be back in stock.',
@ -173,7 +174,8 @@ function isItemInStock() {
        const element = elementsToScan[i];
        // outside the 'fold' or some weird text in the heading area
        // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
-        if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) {
+        // Note: there's also an automated test that places the 'out of stock' text fairly low down
        if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) {
            continue
        }
        elementText = "";
@ -187,7 +189,7 @@ function isItemInStock() {
            // and these mean its out of stock
            for (const outOfStockText of outOfStockTexts) {
                if (elementText.includes(outOfStockText)) {
-                    console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}"`)
+                    console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`)
                    return outOfStockText; // item is out of stock
                }
            }
--- a/changedetectionio/processors/restock_diff/processor.py
+++ b/changedetectionio/processors/restock_diff/processor.py
@ -44,13 +44,16 @@ def get_itemprop_availability(html_content) -> Restock:
    import extruct
    logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")
    value = {}
    now = time.time()
    # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest.
    # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest.
    syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
-
+    try:
        data = extruct.extract(html_content, syntaxes=syntaxes)
    except Exception as e:
        logger.warning(f"Unable to extract data, document parsing with extruct failed with {type(e).__name__} - {str(e)}")
        return Restock()
    logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s")
    # First phase, dead simple scanning of anything that looks useful
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -18,7 +18,7 @@ services:
  #
  #        Log levels are in descending order. (TRACE is the most detailed one)
  #        Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL
-  #      - LOGGER_LEVEL=DEBUG
+  #      - LOGGER_LEVEL=TRACE
  #
  #       Alternative WebDriver/selenium URL, do not use "'s or 's!
  #      - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub
@ -29,8 +29,9 @@ services:
  #
  #             https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
  #
-  #       Alternative Playwright URL, do not use "'s or 's!
+  #       Alternative target "Chrome" Playwright URL, do not use "'s or 's!
-  #      - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000
+  #       "Playwright" is a driver/library that allows changedetection to talk to a Chrome or similar browser.
  #      - PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000
  #
  #       Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
  #
@ -77,10 +78,10 @@ services:
 #              condition: service_started
-     # Used for fetching pages via Playwright+Chrome where you need Javascript support.
+     # Sockpuppetbrowser is basically chrome wrapped in an API for allowing fast fetching of web-pages.
     # RECOMMENDED FOR FETCHING PAGES WITH CHROME
-#    playwright-chrome:
+#    sockpuppetbrowser:
-#        hostname: playwright-chrome
+#        hostname: sockpuppetbrowser
 #        image: dgtlmoon/sockpuppetbrowser:latest
 #        cap_add:
 #            - SYS_ADMIN