Merge branch 'master' into price-scraper-ML-integration

price-scraper-ML-integration
dgtlmoon 4 months ago
commit f967893bd8

@ -2,7 +2,7 @@
# Read more https://github.com/dgtlmoon/changedetection.io/wiki # Read more https://github.com/dgtlmoon/changedetection.io/wiki
__version__ = '0.46.03' __version__ = '0.46.04'
from changedetectionio.strtobool import strtobool from changedetectionio.strtobool import strtobool
from json.decoder import JSONDecodeError from json.decoder import JSONDecodeError

@ -75,6 +75,7 @@ function isItemInStock() {
'vergriffen', 'vergriffen',
'vorbestellen', 'vorbestellen',
'vorbestellung ist bald möglich', 'vorbestellung ist bald möglich',
'we don\'t currently have any',
'we couldn\'t find any products that match', 'we couldn\'t find any products that match',
'we do not currently have an estimate of when this product will be back in stock.', 'we do not currently have an estimate of when this product will be back in stock.',
'we don\'t know when or if this item will be back in stock.', 'we don\'t know when or if this item will be back in stock.',
@ -173,7 +174,8 @@ function isItemInStock() {
const element = elementsToScan[i]; const element = elementsToScan[i];
// outside the 'fold' or some weird text in the heading area // outside the 'fold' or some weird text in the heading area
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) { // Note: theres also an automated test that places the 'out of stock' text fairly low down
if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) {
continue continue
} }
elementText = ""; elementText = "";
@ -187,7 +189,7 @@ function isItemInStock() {
// and these mean its out of stock // and these mean its out of stock
for (const outOfStockText of outOfStockTexts) { for (const outOfStockText of outOfStockTexts) {
if (elementText.includes(outOfStockText)) { if (elementText.includes(outOfStockText)) {
console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}"`) console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`)
return outOfStockText; // item is out of stock return outOfStockText; // item is out of stock
} }
} }

@ -44,13 +44,16 @@ def get_itemprop_availability(html_content) -> Restock:
import extruct import extruct
logger.trace(f"Imported extruct module in {time.time() - now:.3f}s") logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")
value = {}
now = time.time() now = time.time()
# Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest.
# Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest.
syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph'] syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
try:
data = extruct.extract(html_content, syntaxes=syntaxes)
except Exception as e:
logger.warning(f"Unable to extract data, document parsing with extruct failed with {type(e).__name__} - {str(e)}")
return Restock()
data = extruct.extract(html_content, syntaxes=syntaxes)
logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s") logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s")
# First phase, dead simple scanning of anything that looks useful # First phase, dead simple scanning of anything that looks useful

@ -18,7 +18,7 @@ services:
# #
# Log levels are in descending order. (TRACE is the most detailed one) # Log levels are in descending order. (TRACE is the most detailed one)
# Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL # Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL
# - LOGGER_LEVEL=DEBUG # - LOGGER_LEVEL=TRACE
# #
# Alternative WebDriver/selenium URL, do not use "'s or 's! # Alternative WebDriver/selenium URL, do not use "'s or 's!
# - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub # - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub
@ -29,8 +29,9 @@ services:
# #
# https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy # https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
# #
# Alternative Playwright URL, do not use "'s or 's! # Alternative target "Chrome" Playwright URL, do not use "'s or 's!
# - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000 # "Playwright" is a driver/librarythat allows changedetection to talk to a Chrome or similar browser.
# - PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000
# #
# Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password # Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
# #
@ -77,10 +78,10 @@ services:
# condition: service_started # condition: service_started
# Used for fetching pages via Playwright+Chrome where you need Javascript support. # Sockpuppetbrowser is basically chrome wrapped in an API for allowing fast fetching of web-pages.
# RECOMMENDED FOR FETCHING PAGES WITH CHROME # RECOMMENDED FOR FETCHING PAGES WITH CHROME
# playwright-chrome: # sockpuppetbrowser:
# hostname: playwright-chrome # hostname: sockpuppetbrowser
# image: dgtlmoon/sockpuppetbrowser:latest # image: dgtlmoon/sockpuppetbrowser:latest
# cap_add: # cap_add:
# - SYS_ADMIN # - SYS_ADMIN

Loading…
Cancel
Save